tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

NSQ_del_dec_neon_intr.c (65908B)


      1 /***********************************************************************
      2 Copyright (c) 2017 Google Inc.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <arm_neon.h>
     33 #ifdef OPUS_CHECK_ASM
     34 # include <string.h>
     35 #endif
     36 #include "main.h"
     37 #include "stack_alloc.h"
     38 #include "os_support.h"
     39 
     40 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
     41 /* If there are more states, C function is called, and this optimization must be expanded. */
     42 #define NEON_MAX_DEL_DEC_STATES 4
     43 
/* Delayed-decision state, transposed so the innermost dimension is the state
 * index: one row holds the same quantity for all NEON_MAX_DEL_DEC_STATES
 * states, allowing a single 4-lane NEON load/store per row (see the
 * vld1q_s32/vst1q_s32 accesses below). */
typedef struct {
   opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ][ NEON_MAX_DEL_DEC_STATES ]; /* short-term LPC state, Q14 */
   opus_int32 RandState[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* dither PRNG state per delayed sample */
   opus_int32 Q_Q10[     DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* quantized pulses, Q10; flushed to pulses[] for the winner */
   opus_int32 Xq_Q14[    DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* quantized signal, Q14; scaled by gain into xq[] for the winner */
   opus_int32 Pred_Q15[  DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Q15 prediction history (used by the quantizer kernel) */
   opus_int32 Shape_Q14[ DECISION_DELAY ][     NEON_MAX_DEL_DEC_STATES ]; /* Q14 shaping history; flushed into NSQ->sLTP_shp_Q14 for the winner */
   opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ][ NEON_MAX_DEL_DEC_STATES ]; /* noise-shaping AR filter state, Q14 */
   opus_int32 LF_AR_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* copied back to NSQ->sLF_AR_shp_Q14 for the winner */
   opus_int32 Diff_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* copied back to NSQ->sDiff_shp_Q14 for the winner */
   opus_int32 Seed[      NEON_MAX_DEL_DEC_STATES ]; /* current PRNG seed per state */
   opus_int32 SeedInit[  NEON_MAX_DEL_DEC_STATES ]; /* seed at frame start; winner's value is written to psIndices->Seed */
   opus_int32 RD_Q10[    NEON_MAX_DEL_DEC_STATES ]; /* accumulated rate-distortion, Q10; minimum selects the winner */
} NSQ_del_decs_struct;
     58 
/* Per-sample candidate values for all NEON_MAX_DEL_DEC_STATES states, in the
 * same transposed (lane-per-state) layout as NSQ_del_decs_struct. Presumably
 * filled by the quantizer kernel defined later in this file — the consumers
 * are outside this excerpt. */
typedef struct {
   opus_int32 Q_Q10[        NEON_MAX_DEL_DEC_STATES ]; /* candidate quantized pulse, Q10 */
   opus_int32 RD_Q10[       NEON_MAX_DEL_DEC_STATES ]; /* candidate rate-distortion cost, Q10 */
   opus_int32 xq_Q14[       NEON_MAX_DEL_DEC_STATES ]; /* candidate quantized signal, Q14 */
   opus_int32 LF_AR_Q14[    NEON_MAX_DEL_DEC_STATES ]; /* candidate low-frequency AR state, Q14 */
   opus_int32 Diff_Q14[     NEON_MAX_DEL_DEC_STATES ]; /* candidate difference signal, Q14 */
   opus_int32 sLTP_shp_Q14[ NEON_MAX_DEL_DEC_STATES ]; /* candidate LTP shaping state, Q14 */
   opus_int32 LPC_exc_Q14[  NEON_MAX_DEL_DEC_STATES ]; /* candidate LPC excitation, Q14 */
} NSQ_samples_struct;
     68 
/* Forward declaration; the definition appears later in this file.
 * Rescales the NSQ state buffers and the input to the current subframe's
 * gain (see the per-parameter notes below). */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
   const silk_encoder_state *psEncC,               /* I    Encoder State                       */
   silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
   NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
   const opus_int16    x16[],                      /* I    Input                               */
   opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
   const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
   opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
   opus_int            subfr,                      /* I    Subframe number                     */
   const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
   const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
   const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
   const opus_int      signal_type,                /* I    Signal type                         */
   const opus_int      decisionDelay               /* I    Decision delay                      */
);
     84 
     85 /******************************************/
     86 /* Noise shape quantizer for one subframe */
     87 /******************************************/
/* Forward declaration; the definition appears later in this file.
 * Quantizes one subframe, advancing all delayed-decision states in parallel. */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
   silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
   NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
   opus_int            signalType,             /* I    Signal type                         */
   const opus_int32    x_Q10[],                /* I                                        */
   opus_int8           pulses[],               /* O                                        */
   opus_int16          xq[],                   /* O                                        */
   opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
   opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
   const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
   const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
   const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
   opus_int            lag,                    /* I    Pitch lag                           */
   opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
   opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
   opus_int32          LF_shp_Q14,             /* I                                        */
   opus_int32          Gain_Q16,               /* I                                        */
   opus_int            Lambda_Q10,             /* I                                        */
   opus_int            offset_Q10,             /* I                                        */
   opus_int            length,                 /* I    Input length                        */
   opus_int            subfr,                  /* I    Subframe number                     */
   opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
   opus_int            predictLPCOrder,        /* I    Prediction filter order             */
   opus_int            warping_Q16,            /* I                                        */
   opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
   opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
   opus_int            decisionDelay           /* I                                        */
);
    116 
/* Copy 8 consecutive samples of the winning delayed-decision state to the
 * output buffers. Reads samples last_smple_idx down to last_smple_idx-7
 * (caller must guarantee last_smple_idx >= 7), gathering lane Winner_ind of
 * each transposed row, and writes 8 entries each to pulses[offset..],
 * pxq[offset..] and NSQ->sLTP_shp_Q14[sLTP_shp_buf_idx + offset..].
 * gain_lo/gain_hi/shift encode the Q16 gain multiply + rounding right shift
 * applied to Xq_Q14 (see copy_winner_state). t0/t1 are passed by value purely
 * as pre-initialized scratch registers. */
static OPUS_INLINE void copy_winner_state_kernel(
   const NSQ_del_decs_struct *psDelDec,
   const opus_int            offset,
   const opus_int            last_smple_idx,
   const opus_int            Winner_ind,
   const int32x2_t           gain_lo_s32x2,
   const int32x2_t           gain_hi_s32x2,
   const int32x4_t           shift_s32x4,
   int32x4_t                 t0_s32x4,
   int32x4_t                 t1_s32x4,
   opus_int8 *const          pulses,
   opus_int16                *pxq,
   silk_nsq_state            *NSQ
)
{
   int16x8_t t_s16x8;
   int32x4_t o0_s32x4, o1_s32x4;

   /* Gather 8 pulses (Q10), round-shift to Q0 and narrow to int8. */
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Q_Q10[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
   t_s16x8 = vcombine_s16( vrshrn_n_s32( t0_s32x4, 10 ), vrshrn_n_s32( t1_s32x4, 10 ) );
   vst1_s8( &pulses[ offset ], vmovn_s16( t_s16x8 ) );

   /* Gather 8 Xq samples (Q14), multiply by the split gain (equivalent to
      silk_SMULWW with the Q16 gain), round-shift and saturate to int16. */
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Xq_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
   o0_s32x4 = vqdmulhq_lane_s32( t0_s32x4, gain_lo_s32x2, 0 );
   o1_s32x4 = vqdmulhq_lane_s32( t1_s32x4, gain_lo_s32x2, 0 );
   o0_s32x4 = vmlaq_lane_s32( o0_s32x4, t0_s32x4, gain_hi_s32x2, 0 );
   o1_s32x4 = vmlaq_lane_s32( o1_s32x4, t1_s32x4, gain_hi_s32x2, 0 );
   o0_s32x4 = vrshlq_s32( o0_s32x4, shift_s32x4 );
   o1_s32x4 = vrshlq_s32( o1_s32x4, shift_s32x4 );
   vst1_s16( &pxq[ offset + 0 ], vqmovn_s32( o0_s32x4 ) );
   vst1_s16( &pxq[ offset + 4 ], vqmovn_s32( o1_s32x4 ) );

   /* Gather 8 shaping samples (Q14) and store them into the LTP shaping
      state unchanged. */
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 0 ][ Winner_ind ], t0_s32x4, 0 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 1 ][ Winner_ind ], t0_s32x4, 1 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 2 ][ Winner_ind ], t0_s32x4, 2 );
   t0_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 3 ][ Winner_ind ], t0_s32x4, 3 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 4 ][ Winner_ind ], t1_s32x4, 0 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 5 ][ Winner_ind ], t1_s32x4, 1 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 6 ][ Winner_ind ], t1_s32x4, 2 );
   t1_s32x4 = vld1q_lane_s32( &psDelDec->Shape_Q14[ last_smple_idx - 7 ][ Winner_ind ], t1_s32x4, 3 );
   vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 0 ], t0_s32x4 );
   vst1q_s32( &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx + offset + 4 ], t1_s32x4 );
}
    174 
    175 static OPUS_INLINE void copy_winner_state(
    176    const NSQ_del_decs_struct *psDelDec,
    177    const opus_int            decisionDelay,
    178    const opus_int            smpl_buf_idx,
    179    const opus_int            Winner_ind,
    180    const opus_int32          gain,
    181    const opus_int32          shift,
    182    opus_int8 *const          pulses,
    183    opus_int16                *pxq,
    184    silk_nsq_state            *NSQ
    185 )
    186 {
    187    opus_int        i, last_smple_idx;
    188    const int32x2_t gain_lo_s32x2 = vdup_n_s32( silk_LSHIFT32( gain & 0x0000FFFF, 15 ) );
    189    const int32x2_t gain_hi_s32x2 = vdup_n_s32( gain >> 16 );
    190    const int32x4_t shift_s32x4 = vdupq_n_s32( -shift );
    191    int32x4_t       t0_s32x4, t1_s32x4;
    192 
    193    t0_s32x4 = t1_s32x4 = vdupq_n_s32( 0 ); /* initialization */
    194    last_smple_idx = smpl_buf_idx + decisionDelay - 1 + DECISION_DELAY;
    195    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    196    if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
    197 
    198    for( i = 0; ( i < ( decisionDelay - 7 ) ) && ( last_smple_idx >= 7 ); i += 8, last_smple_idx -= 8 ) {
    199        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    200    }
    201    for( ; ( i < decisionDelay ) && ( last_smple_idx >= 0 ); i++, last_smple_idx-- ) {
    202        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
    203        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
    204        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    205    }
    206 
    207    last_smple_idx += DECISION_DELAY;
    208    for( ; i < ( decisionDelay - 7 ); i++, last_smple_idx-- ) {
    209        copy_winner_state_kernel( psDelDec, i - decisionDelay, last_smple_idx, Winner_ind, gain_lo_s32x2, gain_hi_s32x2, shift_s32x4, t0_s32x4, t1_s32x4, pulses, pxq, NSQ );
    210    }
    211    for( ; i < decisionDelay; i++, last_smple_idx-- ) {
    212        pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
    213        pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], gain ), shift ) );
    214        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
    215    }
    216 }
    217 
/*
 * NEON entry point for the SILK delayed-decision noise shape quantizer.
 * Runs the 3 or 4 delayed-decision states in parallel NEON lanes; for any
 * other state count (>4 unsupported, <=2 not worth the overhead) it falls
 * back to the C reference silk_NSQ_del_dec_c(). The interface and results
 * match silk_NSQ_del_dec_c() (verified when OPUS_CHECK_ASM is defined).
 */
void silk_NSQ_del_dec_neon(
   const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */
   silk_nsq_state              *NSQ,                                       /* I/O  NSQ state                       */
   SideInfoIndices             *psIndices,                                 /* I/O  Quantization Indices            */
   const opus_int16            x16[],                                      /* I    Input                           */
   opus_int8                   pulses[],                                   /* O    Quantized pulse signal          */
   const opus_int16            *PredCoef_Q12,                              /* I    Short term prediction coefs     */
   const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],    /* I    Long term prediction coefs      */
   const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs              */
   const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],          /* I    Long term shaping coefs         */
   const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                   /* I    Spectral tilt                   */
   const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                 /* I    Low frequency shaping coefs     */
   const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                  /* I    Quantization step sizes         */
   const opus_int              pitchL[ MAX_NB_SUBFR ],                     /* I    Pitch lags                      */
   const opus_int              Lambda_Q10,                                 /* I    Rate/distortion tradeoff        */
   const opus_int              LTP_scale_Q14                               /* I    LTP state scaling               */
)
{
#ifdef OPUS_CHECK_ASM
   /* Snapshot the inputs and run the C reference in parallel; outputs are
      compared at the end of this function. */
   silk_nsq_state NSQ_c;
   SideInfoIndices psIndices_c;
   opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
   const opus_int8 *const pulses_a = pulses;

   ( void )pulses_a;
   silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
   silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
   silk_memcpy( pulses_c, pulses, sizeof( pulses_c ) );
   silk_NSQ_del_dec_c( psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16,
                      pitchL, Lambda_Q10, LTP_scale_Q14 );
#endif

   /* The optimization parallelizes the different delay decision states. */
   if(( psEncC->nStatesDelayedDecision > NEON_MAX_DEL_DEC_STATES ) || ( psEncC->nStatesDelayedDecision <= 2 )) {
       /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
       /* If there are more states, C function is called, and this optimization must be expanded. */
       /* When the number of delay decision states is less than 3, there are penalties using this */
       /* optimization, and C function is called.                                                 */
       /* When the number of delay decision states is 2, it's better to specialize another        */
       /* structure NSQ_del_dec2_struct and optimize with shorter NEON registers. (Low priority)  */
       silk_NSQ_del_dec_c( psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14,
           Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14 );
   } else {
       opus_int            i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr;
       opus_int            smpl_buf_idx, decisionDelay;
       const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
       opus_int16          *pxq;
       VARDECL( opus_int32, sLTP_Q15 );
       VARDECL( opus_int16, sLTP );
       opus_int32          HarmShapeFIRPacked_Q14;
       opus_int            offset_Q10;
       opus_int32          RDmin_Q10, Gain_Q10;
       VARDECL( opus_int32, x_sc_Q10 );
       VARDECL( opus_int32, delayedGain_Q10 );
       VARDECL( NSQ_del_decs_struct, psDelDec );
       int32x4_t           t_s32x4;
       SAVE_STACK;

       /* Set unvoiced lag to the previous one, overwrite later for voiced */
       lag = NSQ->lagPrev;

       silk_assert( NSQ->prev_gain_Q16 != 0 );

       /* Initialize delayed decision states */
       ALLOC( psDelDec, 1, NSQ_del_decs_struct );
       OPUS_CLEAR(psDelDec, 1);
       /* Only RandState and RD_Q10 need to be initialized to 0. */
       silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
       vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );

       for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) {
           psDelDec->SeedInit[ k ] = psDelDec->Seed[ k ] = ( k + psIndices->Seed ) & 3;
       }
       /* Broadcast the common scalar NSQ state into all 4 lanes. */
       vst1q_s32( psDelDec->LF_AR_Q14, vld1q_dup_s32( &NSQ->sLF_AR_shp_Q14 ) );
       vst1q_s32( psDelDec->Diff_Q14, vld1q_dup_s32( &NSQ->sDiff_shp_Q14 ) );
       vst1q_s32( psDelDec->Shape_Q14[ 0 ], vld1q_dup_s32( &NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ] ) );
       for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
           vst1q_s32( psDelDec->sLPC_Q14[ i ], vld1q_dup_s32( &NSQ->sLPC_Q14[ i ] ) );
       }
       for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
           vst1q_s32( psDelDec->sAR2_Q14[ i ], vld1q_dup_s32( &NSQ->sAR2_Q14[ i ] ) );
       }

       offset_Q10   = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];
       smpl_buf_idx = 0; /* index of oldest samples */

       decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length );

       /* For voiced frames limit the decision delay to lower than the pitch lag */
       if( psIndices->signalType == TYPE_VOICED ) {
           opus_int pitch_min = pitchL[ 0 ];
           for( k = 1; k < psEncC->nb_subfr; k++ ) {
               pitch_min = silk_min_int( pitch_min, pitchL[ k ] );
           }
           decisionDelay = silk_min_int( decisionDelay, pitch_min - LTP_ORDER / 2 - 1 );
       } else {
           if( lag > 0 ) {
               decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 );
           }
       }

       if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
           LSF_interpolation_flag = 0;
       } else {
           LSF_interpolation_flag = 1;
       }

       ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
       ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
       ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
       ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 );
       /* Set up pointers to start of sub frame */
       pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
       NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
       NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
       subfr = 0;
       for( k = 0; k < psEncC->nb_subfr; k++ ) {
           A_Q12      = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ];
           B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER           ];
           AR_shp_Q13 = &AR_Q13[     k * MAX_SHAPE_LPC_ORDER ];

           /* Noise shape parameters */
           silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
           HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
           HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

           NSQ->rewhite_flag = 0;
           if( psIndices->signalType == TYPE_VOICED ) {
               /* Voiced */
               lag = pitchL[ k ];

               /* Re-whitening */
               if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                   if( k == 2 ) {
                       /* RESET DELAYED DECISIONS */
                       /* Find winner */
                       int32x4_t RD_Q10_s32x4;
                       RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
                       Winner_ind = 0;
                       for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) {
                           if( psDelDec->RD_Q10[ i ] < RDmin_Q10 ) {
                               RDmin_Q10 = psDelDec->RD_Q10[ i ];
                               Winner_ind = i;
                           }
                       }
                       /* Penalize all non-winner states so they re-converge to the winner. */
                       psDelDec->RD_Q10[ Winner_ind ] -= ( silk_int32_MAX >> 4 );
                       RD_Q10_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
                       RD_Q10_s32x4 = vaddq_s32( RD_Q10_s32x4, vdupq_n_s32( silk_int32_MAX >> 4 ) );
                       vst1q_s32( psDelDec->RD_Q10, RD_Q10_s32x4 );

                       /* Copy final part of signals from winner state to output and long-term filter states */
                       copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gains_Q16[ 1 ], 14, pulses, pxq, NSQ );

                       subfr = 0;
                   }

                   /* Rewhiten with new A coefs */
                   start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                   silk_assert( start_idx > 0 );

                   silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                       A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                   NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
                   NSQ->rewhite_flag = 1;
               }
           }

           silk_nsq_del_dec_scale_states_neon( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k,
               LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay );

           silk_noise_shape_quantizer_del_dec_neon( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15,
               delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ],
               Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder,
               psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay );

           x16    += psEncC->subfr_length;
           pulses += psEncC->subfr_length;
           pxq    += psEncC->subfr_length;
       }

       /* Find winner */
       RDmin_Q10 = psDelDec->RD_Q10[ 0 ];
       Winner_ind = 0;
       for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) {
           if( psDelDec->RD_Q10[ k ] < RDmin_Q10 ) {
               RDmin_Q10 = psDelDec->RD_Q10[ k ];
               Winner_ind = k;
           }
       }

       /* Copy final part of signals from winner state to output and long-term filter states */
       psIndices->Seed = psDelDec->SeedInit[ Winner_ind ];
       Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 );
       copy_winner_state( psDelDec, decisionDelay, smpl_buf_idx, Winner_ind, Gain_Q10, 8, pulses, pxq, NSQ );

       /* De-interleave the winner's LPC state (lane Winner_ind) back into the
          scalar NSQ state, 4 entries per iteration plus a scalar tail. */
       t_s32x4 = vdupq_n_s32( 0 ); /* initialization */
       for( i = 0; i < ( NSQ_LPC_BUF_LENGTH - 3 ); i += 4 ) {
           t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sLPC_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
           vst1q_s32( &NSQ->sLPC_Q14[ i ], t_s32x4 );
       }

       for( ; i < NSQ_LPC_BUF_LENGTH; i++ ) {
         NSQ->sLPC_Q14[ i ] = psDelDec->sLPC_Q14[ i ][ Winner_ind ];
       }

       /* Same de-interleave for the winner's noise-shaping AR state. */
       for( i = 0; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) - 3 ); i += 4 ) {
           t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 0 ][ Winner_ind ], t_s32x4, 0 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 1 ][ Winner_ind ], t_s32x4, 1 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 2 ][ Winner_ind ], t_s32x4, 2 );
           t_s32x4 = vld1q_lane_s32( &psDelDec->sAR2_Q14[ i + 3 ][ Winner_ind ], t_s32x4, 3 );
           vst1q_s32( &NSQ->sAR2_Q14[ i ], t_s32x4 );
       }

       for( ; i < (opus_int)( sizeof( NSQ->sAR2_Q14 ) / sizeof( NSQ->sAR2_Q14[ 0 ] ) ); i++ ) {
         NSQ->sAR2_Q14[ i ] = psDelDec->sAR2_Q14[ i ][ Winner_ind ];
       }

       /* Update states */
       NSQ->sLF_AR_shp_Q14 = psDelDec->LF_AR_Q14[ Winner_ind ];
       NSQ->sDiff_shp_Q14  = psDelDec->Diff_Q14[ Winner_ind ];
       NSQ->lagPrev        = pitchL[ psEncC->nb_subfr - 1 ];

       /* Save quantized speech signal */
       silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
       silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
       RESTORE_STACK;
   }

#ifdef OPUS_CHECK_ASM
   /* Verify bit-exactness against the C reference computed above. */
   silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
   silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
   silk_assert( !memcmp( pulses_c, pulses_a, sizeof( pulses_c ) ) );
#endif
}
    456 
    457 /******************************************/
    458 /* Noise shape quantizer for one subframe */
    459 /******************************************/
    460 /* Note: Function silk_short_prediction_create_arch_coef_neon() defined in NSQ_neon.h is actually a hacking C function. */
    461 /*       Therefore here we append "_local" to the NEON function name to avoid confusion.                                */
/* Repack `order` 16-bit prediction coefficients into 16 opus_int32 slots:
 * each coefficient is widened and left-shifted by 15 (vshll_n_s16), and the
 * coefficients are stored in reversed order, zero-padded at the front when
 * order == 10. The lane diagrams on the right show coefficient indices per
 * 4-lane vector. */
static OPUS_INLINE void silk_short_prediction_create_arch_coef_neon_local(opus_int32 *out, const opus_int16 *in, opus_int order)
{
 int16x8_t t_s16x8;
 int32x4_t t0_s32x4, t1_s32x4, t2_s32x4, t3_s32x4;
 silk_assert( order == 10 || order == 16 );

 t_s16x8 = vld1q_s16( in + 0 );                                                   /* 7 6 5 4  3 2 1 0    */
 t_s16x8 = vrev64q_s16( t_s16x8 );                                                /* 4 5 6 7  0 1 2 3    */
 t2_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );                          /* 4 5 6 7             */
 t3_s32x4 = vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 );                          /* 0 1 2 3             */

 if( order == 16 ) {
     t_s16x8 = vld1q_s16( in + 8 );                                               /* F E D C  B A 9 8    */
     t_s16x8 = vrev64q_s16( t_s16x8 );                                            /* C D E F  8 9 A B    */
     t0_s32x4 = vshll_n_s16( vget_high_s16( t_s16x8 ), 15 );                      /* C D E F             */
     t1_s32x4 = vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 );                      /* 8 9 A B             */
 } else {
     int16x4_t t_s16x4;

     /* order == 10: top 6 slots are zero; only coefs 8 and 9 land in t1. */
     t0_s32x4 = vdupq_n_s32( 0 );                                                 /* zero zero zero zero */
     t_s16x4 = vld1_s16( in + 6 );                                                /* 9    8    7    6    */
     t_s16x4 = vrev64_s16( t_s16x4 );                                             /* 6    7    8    9    */
     t1_s32x4 = vshll_n_s16( t_s16x4, 15 );
     t1_s32x4 = vcombine_s32( vget_low_s32(t0_s32x4), vget_low_s32( t1_s32x4 ) ); /* 8    9    zero zero */
 }
 vst1q_s32( out +  0, t0_s32x4 );
 vst1q_s32( out +  4, t1_s32x4 );
 vst1q_s32( out +  8, t2_s32x4 );
 vst1q_s32( out + 12, t3_s32x4 );
}
    492 
    493 static OPUS_INLINE int32x4_t silk_SMLAWB_lane0_neon(
    494    const int32x4_t out_s32x4,
    495    const int32x4_t in_s32x4,
    496    const int32x2_t coef_s32x2
    497 )
    498 {
    499    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 0 ) );
    500 }
    501 
    502 static OPUS_INLINE int32x4_t silk_SMLAWB_lane1_neon(
    503    const int32x4_t out_s32x4,
    504    const int32x4_t in_s32x4,
    505    const int32x2_t coef_s32x2
    506 )
    507 {
    508    return vaddq_s32( out_s32x4, vqdmulhq_lane_s32( in_s32x4, coef_s32x2, 1 ) );
    509 }
    510 
/* Note: This function has different return value than silk_noise_shape_quantizer_short_prediction_neon(). */
/*       Therefore here we append "_local" to the function name to avoid confusion.                        */
/* Computes the short-term LPC prediction for all NEON_MAX_DEL_DEC_STATES delayed-decision  */
/* states at once. buf32 is interleaved: buf32[ t * NEON_MAX_DEL_DEC_STATES + k ] is        */
/* history sample t of state k, so each vld1q_s32 below fetches one time step across all    */
/* states. a_Q12_arch holds coefficients prepared by                                        */
/* silk_short_prediction_create_arch_coef_neon_local(); for order 10 the unused taps are    */
/* zero, so unconditionally accumulating all 16 taps is safe. The caller shifts the result  */
/* from Q10 to Q14.                                                                         */
static OPUS_INLINE int32x4_t silk_noise_shape_quantizer_short_prediction_neon_local(const opus_int32 *buf32, const opus_int32 *a_Q12_arch, opus_int order)
{
    const int32x4_t a_Q12_arch0_s32x4 = vld1q_s32( a_Q12_arch + 0 );
    const int32x4_t a_Q12_arch1_s32x4 = vld1q_s32( a_Q12_arch + 4 );
    const int32x4_t a_Q12_arch2_s32x4 = vld1q_s32( a_Q12_arch + 8 );
    const int32x4_t a_Q12_arch3_s32x4 = vld1q_s32( a_Q12_arch + 12 );
    int32x4_t LPC_pred_Q14_s32x4;

    silk_assert( order == 10 || order == 16 );
    /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    LPC_pred_Q14_s32x4 = vdupq_n_s32( silk_RSHIFT( order, 1 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  0 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  1 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  2 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  3 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch0_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  4 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  5 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  6 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  7 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch1_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  8 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 +  9 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 10 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 11 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch2_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 12 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 13 * NEON_MAX_DEL_DEC_STATES ), vget_low_s32(  a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane0_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 14 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );
    LPC_pred_Q14_s32x4 = silk_SMLAWB_lane1_neon( LPC_pred_Q14_s32x4, vld1q_s32( buf32 + 15 * NEON_MAX_DEL_DEC_STATES ), vget_high_s32( a_Q12_arch3_s32x4 ) );

    return LPC_pred_Q14_s32x4;
}
    543 
/* Core delayed-decision noise shape quantizer for one subframe. The         */
/* NEON_MAX_DEL_DEC_STATES decision states are processed in parallel: each   */
/* int32x4_t below holds one value per state (lane k = state k).             */
static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_neon(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],             /* I/O  Delayed decision states             */
    opus_int            signalType,             /* I    Signal type                         */
    const opus_int32    x_Q10[],                /* I                                        */
    opus_int8           pulses[],               /* O                                        */
    opus_int16          xq[],                   /* O                                        */
    opus_int32          sLTP_Q15[],             /* I/O  LTP filter state                    */
    opus_int32          delayedGain_Q10[],      /* I/O  Gain delay buffer                   */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs         */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs          */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping coefs                 */
    opus_int            lag,                    /* I    Pitch lag                           */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                        */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                       */
    opus_int32          LF_shp_Q14,             /* I                                        */
    opus_int32          Gain_Q16,               /* I                                        */
    opus_int            Lambda_Q10,             /* I                                        */
    opus_int            offset_Q10,             /* I                                        */
    opus_int            length,                 /* I    Input length                        */
    opus_int            subfr,                  /* I    Subframe number                     */
    opus_int            shapingLPCOrder,        /* I    Shaping LPC filter order            */
    opus_int            predictLPCOrder,        /* I    Prediction filter order             */
    opus_int            warping_Q16,            /* I                                        */
    opus_int            nStatesDelayedDecision, /* I    Number of states in decision tree   */
    opus_int            *smpl_buf_idx,          /* I/O  Index to newest samples in buffers  */
    opus_int            decisionDelay           /* I                                        */
)
{
    opus_int     i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx;
    opus_int32   Winner_rand_state;
    opus_int32   LTP_pred_Q14, n_LTP_Q14;
    opus_int32   RDmin_Q10, RDmax_Q10;
    opus_int32   Gain_Q10;
    opus_int32   *pred_lag_ptr, *shp_lag_ptr;
    opus_int32   a_Q12_arch[MAX_LPC_ORDER];
    /* Coefs prescaled ( << 16 >> 1 ) so vqdmulh's doubling high-half multiply */
    /* reproduces silk_SMLAWB()'s ( x * coef ) >> 16.                          */
    const int32x2_t warping_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( warping_Q16, 16 ) >> 1 );
    const opus_int32 LF_shp_Q29 = silk_LSHIFT32( LF_shp_Q14, 16 ) >> 1;
    opus_int32 AR_shp_Q28[ MAX_SHAPE_LPC_ORDER ];
    const uint32x4_t rand_multiplier_u32x4 = vdupq_n_u32( RAND_MULTIPLIER );
    const uint32x4_t rand_increment_u32x4 = vdupq_n_u32( RAND_INCREMENT );

    VARDECL( NSQ_samples_struct, psSampleState );
    SAVE_STACK;

    silk_assert( nStatesDelayedDecision > 0 );
    silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
    ALLOC( psSampleState, 2, NSQ_samples_struct );
    OPUS_CLEAR(psSampleState, 2);

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Pre-widen the Q13 shaping coefs to Q28 ( << 15 ) for the vqdmulh-based */
    /* multiplies in the shaping filter loop below.                           */
    for( i = 0; i < ( MAX_SHAPE_LPC_ORDER - 7 ); i += 8 ) {
      const int16x8_t t_s16x8 = vld1q_s16( AR_shp_Q13 +  i );
      vst1q_s32( AR_shp_Q28 + i + 0, vshll_n_s16( vget_low_s16(  t_s16x8 ), 15 ) );
      vst1q_s32( AR_shp_Q28 + i + 4, vshll_n_s16( vget_high_s16( t_s16x8 ), 15 ) );
    }

    for( ; i < MAX_SHAPE_LPC_ORDER; i++ ) {
      AR_shp_Q28[i] = silk_LSHIFT32( AR_shp_Q13[i], 15 );
    }

    silk_short_prediction_create_arch_coef_neon_local( a_Q12_arch, a_Q12, predictLPCOrder );

    for( i = 0; i < length; i++ ) {
        int32x4_t Seed_s32x4, LPC_pred_Q14_s32x4;
        int32x4_t sign_s32x4, tmp1_s32x4, tmp2_s32x4;
        int32x4_t n_AR_Q14_s32x4, n_LF_Q14_s32x4;
        int32x2_t AR_shp_Q28_s32x2;
        int16x4_t r_Q10_s16x4, rr_Q10_s16x4;

        /* Perform common calculations used in all states */

        /* Long-term prediction */
        if( signalType == TYPE_VOICED ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q14 = 2;
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[  0 ], b_Q14[ 0 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -1 ], b_Q14[ 1 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -2 ], b_Q14[ 2 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -3 ], b_Q14[ 3 ] );
            LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
            LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );                          /* Q13 -> Q14 */
            pred_lag_ptr++;
        } else {
            LTP_pred_Q14 = 0;
        }

        /* Long-term shaping */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
            n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );            /* Q12 -> Q14 */
            shp_lag_ptr++;
        } else {
            n_LTP_Q14 = 0;
        }

        /* Generate dither */
        /* All lanes advance the same LCG: seed = RAND_MULTIPLIER * seed + RAND_INCREMENT */
        Seed_s32x4 = vld1q_s32( psDelDec->Seed );
        Seed_s32x4 = vreinterpretq_s32_u32( vmlaq_u32( rand_increment_u32x4, vreinterpretq_u32_s32( Seed_s32x4 ), rand_multiplier_u32x4 ) );
        vst1q_s32( psDelDec->Seed, Seed_s32x4 );

        /* Short-term prediction */
        LPC_pred_Q14_s32x4 = silk_noise_shape_quantizer_short_prediction_neon_local(psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 16 + i ], a_Q12_arch, predictLPCOrder);
        LPC_pred_Q14_s32x4 = vshlq_n_s32( LPC_pred_Q14_s32x4, 4 ); /* Q10 -> Q14 */

        /* Noise shape feedback */
        /* Output of lowpass section */
        tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->Diff_Q14 ), vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), warping_Q16_s32x2 );
        /* Output of allpass section */
        tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ 1 ] ), tmp2_s32x4 );
        tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
        vst1q_s32( psDelDec->sAR2_Q14[ 0 ], tmp2_s32x4 );
        AR_shp_Q28_s32x2 = vld1_s32( AR_shp_Q28 );
        n_AR_Q14_s32x4 = vaddq_s32( vdupq_n_s32( silk_RSHIFT( shapingLPCOrder, 1 ) ), vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );

        /* Loop over allpass sections, two sections (one loaded coef pair) per iteration */
        for( j = 2; j < shapingLPCOrder; j += 2 ) {
            /* Output of allpass section */
            tmp2_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4 );
            tmp2_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j - 1 ] ), tmp2_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j - 1 ], tmp1_s32x4 );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
            /* Output of allpass section */
            tmp1_s32x4 = vsubq_s32( vld1q_s32( psDelDec->sAR2_Q14[ j + 1 ] ), tmp2_s32x4 );
            tmp1_s32x4 = silk_SMLAWB_lane0_neon( vld1q_s32( psDelDec->sAR2_Q14[ j + 0 ] ), tmp1_s32x4, warping_Q16_s32x2 );
            vst1q_s32( psDelDec->sAR2_Q14[ j + 0 ], tmp2_s32x4 );
            AR_shp_Q28_s32x2 = vld1_s32( &AR_shp_Q28[ j ] );
            n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp2_s32x4, AR_shp_Q28_s32x2, 0 ) );
        }
        vst1q_s32( psDelDec->sAR2_Q14[ shapingLPCOrder - 1 ], tmp1_s32x4 );
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_lane_s32( tmp1_s32x4, AR_shp_Q28_s32x2, 1 ) );
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 1 );                                                                                        /* Q11 -> Q12 */
        n_AR_Q14_s32x4 = vaddq_s32( n_AR_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( Tilt_Q14, 16 ) >> 1 ) );     /* Q12 */
        n_AR_Q14_s32x4 = vshlq_n_s32( n_AR_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */
        n_LF_Q14_s32x4 = vqdmulhq_n_s32( vld1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ] ), LF_shp_Q29 );                                         /* Q12 */
        n_LF_Q14_s32x4 = vaddq_s32( n_LF_Q14_s32x4, vqdmulhq_n_s32( vld1q_s32( psDelDec->LF_AR_Q14 ), silk_LSHIFT32( LF_shp_Q14 >> 16 , 15 ) ) ); /* Q12 */
        n_LF_Q14_s32x4 = vshlq_n_s32( n_LF_Q14_s32x4, 2 );                                                                                        /* Q12 -> Q14 */

        /* Input minus prediction plus noise feedback                       */
        /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP  */
        tmp1_s32x4 = vaddq_s32( n_AR_Q14_s32x4, n_LF_Q14_s32x4 );               /* Q14 */
        tmp2_s32x4 = vaddq_s32( vdupq_n_s32( n_LTP_Q14 ), LPC_pred_Q14_s32x4 ); /* Q13 */
        tmp1_s32x4 = vsubq_s32( tmp2_s32x4, tmp1_s32x4 );                       /* Q13 */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 4 );                             /* Q10 */
        tmp1_s32x4 = vsubq_s32( vdupq_n_s32( x_Q10[ i ] ), tmp1_s32x4 );        /* residual error Q10 */

        /* Flip sign depending on dither */
        sign_s32x4 = vreinterpretq_s32_u32( vcltq_s32( Seed_s32x4, vdupq_n_s32( 0 ) ) );
        tmp1_s32x4 = veorq_s32( tmp1_s32x4, sign_s32x4 );
        tmp1_s32x4 = vsubq_s32( tmp1_s32x4, sign_s32x4 );
        /* Clamp so the residual fits in 16 bits for the narrowing below. */
        tmp1_s32x4 = vmaxq_s32( tmp1_s32x4, vdupq_n_s32( -( 31 << 10 ) ) );
        tmp1_s32x4 = vminq_s32( tmp1_s32x4, vdupq_n_s32( 30 << 10 ) );
        r_Q10_s16x4 = vmovn_s32( tmp1_s32x4 );

        /* Find two quantization level candidates and measure their rate-distortion */
        {
            int16x4_t q1_Q10_s16x4 = vsub_s16( r_Q10_s16x4, vdup_n_s16( offset_Q10 ) );
            int16x4_t q1_Q0_s16x4 = vshr_n_s16( q1_Q10_s16x4, 10 );
            int16x4_t q2_Q10_s16x4;
            int32x4_t rd1_Q10_s32x4, rd2_Q10_s32x4;
            uint32x4_t t_u32x4;

            if( Lambda_Q10 > 2048 ) {
                /* For aggressive RDO, the bias becomes more than one pulse. */
                const int rdo_offset = Lambda_Q10/2 - 512;
                const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
                /* Per-lane offset that pulls q1 toward zero by rdo_offset, or leaves it alone. */
                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                silk_assert( Lambda_Q10 <= 32767 );

                /* Lanes inside the dead zone collapse to 0 or -1 depending on sign. */
                q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
                q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
                q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
            }
            {
                const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;

                /* Requantize q1 and derive the next-higher candidate q2 = q1 + 1 pulse, */
                /* with the special cases for levels 0 and -1 handled by lane selects.   */
                q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcge_s16(q1_Q0_s16x4, vdup_n_s16(0))), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4,  tmp_summand_s16x4);
                q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
                q2_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( 1024 ) );
                q2_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 + 1024 - QUANT_LEVEL_ADJUST_Q10 ), q2_Q10_s16x4 );
                q2_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 ), q2_Q10_s16x4 );
                /* Rate term: Lambda * |q|; negate negative-level lanes to take the magnitude. */
                tmp1_s16x4 = q1_Q10_s16x4;
                tmp2_s16x4 = q2_Q10_s16x4;
                tmp1_s16x4 = vbsl_s16( vorr_u16( equalMinus1_u16x4, lessThanMinus1_u16x4 ), vneg_s16( tmp1_s16x4 ), tmp1_s16x4 );
                tmp2_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vneg_s16( tmp2_s16x4 ), tmp2_s16x4 );
                rd1_Q10_s32x4 = vmull_s16( tmp1_s16x4, vdup_n_s16( Lambda_Q10 ) );
                rd2_Q10_s32x4 = vmull_s16( tmp2_s16x4, vdup_n_s16( Lambda_Q10 ) );
            }

            /* Distortion term: add squared quantization error, then scale back to Q10. */
            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q1_Q10_s16x4 );
            rd1_Q10_s32x4 = vmlal_s16( rd1_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd1_Q10_s32x4 = vshrq_n_s32( rd1_Q10_s32x4, 10 );

            rr_Q10_s16x4 = vsub_s16( r_Q10_s16x4, q2_Q10_s16x4 );
            rd2_Q10_s32x4 = vmlal_s16( rd2_Q10_s32x4, rr_Q10_s16x4, rr_Q10_s16x4 );
            rd2_Q10_s32x4 = vshrq_n_s32( rd2_Q10_s32x4, 10 );

            /* psSampleState[ 0 ] gets each lane's better candidate, [ 1 ] the worse one. */
            tmp2_s32x4 = vld1q_s32( psDelDec->RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp2_s32x4, vminq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vmaxq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );
            t_u32x4 = vcltq_s32( rd1_Q10_s32x4, rd2_Q10_s32x4 );
            tmp1_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q1_Q10_s16x4 ), vmovl_s16( q2_Q10_s16x4 ) );
            tmp2_s32x4 = vbslq_s32( t_u32x4, vmovl_s16( q2_Q10_s16x4 ), vmovl_s16( q1_Q10_s16x4 ) );
            vst1q_s32( psSampleState[ 0 ].Q_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].Q_Q10, tmp2_s32x4 );
        }

        {
            /* Update states for best quantization */
            int32x4_t exc_Q14_s32x4, LPC_exc_Q14_s32x4, xq_Q14_s32x4, sLF_AR_shp_Q14_s32x4;

            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp1_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 0 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 0 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 0 ].xq_Q14, xq_Q14_s32x4 );

            /* Same state updates for the second-best quantization candidate. */
            /* Quantized excitation */
            exc_Q14_s32x4 = vshlq_n_s32( tmp2_s32x4, 4 );
            exc_Q14_s32x4 = veorq_s32( exc_Q14_s32x4, sign_s32x4 );
            exc_Q14_s32x4 = vsubq_s32( exc_Q14_s32x4, sign_s32x4 );

            /* Add predictions */
            LPC_exc_Q14_s32x4 = vaddq_s32( exc_Q14_s32x4, vdupq_n_s32( LTP_pred_Q14 ) );
            xq_Q14_s32x4      = vaddq_s32( LPC_exc_Q14_s32x4, LPC_pred_Q14_s32x4 );

            /* Update states */
            tmp1_s32x4 = vsubq_s32( xq_Q14_s32x4, vshlq_n_s32( vdupq_n_s32( x_Q10[ i ] ), 4 ) );
            vst1q_s32( psSampleState[ 1 ].Diff_Q14, tmp1_s32x4 );
            sLF_AR_shp_Q14_s32x4 = vsubq_s32( tmp1_s32x4, n_AR_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].sLTP_shp_Q14, vsubq_s32( sLF_AR_shp_Q14_s32x4, n_LF_Q14_s32x4 ) );
            vst1q_s32( psSampleState[ 1 ].LF_AR_Q14, sLF_AR_shp_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].LPC_exc_Q14, LPC_exc_Q14_s32x4 );
            vst1q_s32( psSampleState[ 1 ].xq_Q14, xq_Q14_s32x4 );
        }

        /* Circular buffer index moves backwards; the sum below can exceed */
        /* DECISION_DELAY by up to twice, hence the two conditional wraps. */
        *smpl_buf_idx = *smpl_buf_idx ? ( *smpl_buf_idx - 1 ) : ( DECISION_DELAY - 1);
        last_smple_idx = *smpl_buf_idx + decisionDelay + DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;
        if( last_smple_idx >= DECISION_DELAY ) last_smple_idx -= DECISION_DELAY;

        /* Find winner */
        RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
        Winner_ind = 0;
        for( k = 1; k < nStatesDelayedDecision; k++ ) {
            if( psSampleState[ 0 ].RD_Q10[ k ] < RDmin_Q10 ) {
                RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                Winner_ind = k;
            }
        }

        /* clear unused part of RD_Q10 to avoid overflows */
        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
        {
            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
        }

        /* Increase RD values of expired states */
        {
            uint32x4_t t_u32x4;
            Winner_rand_state = psDelDec->RandState[ last_smple_idx ][ Winner_ind ];
            t_u32x4 = vceqq_s32( vld1q_s32( psDelDec->RandState[ last_smple_idx ] ), vdupq_n_s32( Winner_rand_state ) );
            t_u32x4 = vmvnq_u32( t_u32x4 );
            /* Non-matching lanes become all-ones; >> 5 turns that into a large RD penalty. */
            t_u32x4 = vshrq_n_u32( t_u32x4, 5 );
            tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].RD_Q10 );
            tmp2_s32x4 = vld1q_s32( psSampleState[ 1 ].RD_Q10 );
            tmp1_s32x4 = vaddq_s32( tmp1_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            tmp2_s32x4 = vaddq_s32( tmp2_s32x4, vreinterpretq_s32_u32( t_u32x4 ) );
            vst1q_s32( psSampleState[ 0 ].RD_Q10, tmp1_s32x4 );
            vst1q_s32( psSampleState[ 1 ].RD_Q10, tmp2_s32x4 );

            /* Find worst in first set and best in second set */
            RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ 0 ];
            RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ 0 ];
            RDmax_ind = 0;
            RDmin_ind = 0;
            for( k = 1; k < nStatesDelayedDecision; k++ ) {
                /* find worst in first set */
                if( psSampleState[ 0 ].RD_Q10[ k ] > RDmax_Q10 ) {
                    RDmax_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                    RDmax_ind = k;
                }
                /* find best in second set */
                if( psSampleState[ 1 ].RD_Q10[ k ] < RDmin_Q10 ) {
                    RDmin_Q10 = psSampleState[ 1 ].RD_Q10[ k ];
                    RDmin_ind = k;
                }
            }
        }

        /* Replace a state if best from second set outperforms worst in first set */
        if( RDmin_Q10 < RDmax_Q10 ) {
            opus_int32 (*ptr)[NEON_MAX_DEL_DEC_STATES] = psDelDec->RandState;
            /* Number of NEON_MAX_DEL_DEC_STATES-wide rows in NSQ_del_decs_struct after sLPC_Q14. */
            const int numOthers = (int)( ( sizeof( NSQ_del_decs_struct ) - sizeof( ( (NSQ_del_decs_struct *)0 )->sLPC_Q14 ) )
                / ( NEON_MAX_DEL_DEC_STATES * sizeof( opus_int32 ) ) );
            /* Only ( predictLPCOrder - 1 ) of sLPC_Q14 buffer need to be updated, though the first several     */
            /* useless sLPC_Q14[] will be different comparing with C when predictLPCOrder < NSQ_LPC_BUF_LENGTH. */
            /* Here just update constant ( NSQ_LPC_BUF_LENGTH - 1 ) for simplicity.                             */
            for( j = i + 1; j < i + NSQ_LPC_BUF_LENGTH; j++ ) {
                psDelDec->sLPC_Q14[ j ][ RDmax_ind ] = psDelDec->sLPC_Q14[ j ][ RDmin_ind ];
            }
            for( j = 0; j < numOthers; j++ ) {
                ptr[ j ][ RDmax_ind ] = ptr[ j ][ RDmin_ind ];
            }

            psSampleState[ 0 ].Q_Q10[ RDmax_ind ] = psSampleState[ 1 ].Q_Q10[ RDmin_ind ];
            psSampleState[ 0 ].RD_Q10[ RDmax_ind ] = psSampleState[ 1 ].RD_Q10[ RDmin_ind ];
            psSampleState[ 0 ].xq_Q14[ RDmax_ind ] = psSampleState[ 1 ].xq_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LF_AR_Q14[ RDmax_ind ] = psSampleState[ 1 ].LF_AR_Q14[ RDmin_ind ];
            psSampleState[ 0 ].Diff_Q14[ RDmax_ind ] = psSampleState[ 1 ].Diff_Q14[ RDmin_ind ];
            psSampleState[ 0 ].sLTP_shp_Q14[ RDmax_ind ] = psSampleState[ 1 ].sLTP_shp_Q14[ RDmin_ind ];
            psSampleState[ 0 ].LPC_exc_Q14[ RDmax_ind ] = psSampleState[ 1 ].LPC_exc_Q14[ RDmin_ind ];
        }

        /* Write samples from winner to output and long-term filter states */
        if( subfr > 0 || i >= decisionDelay ) {
            pulses[  i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDelDec->Q_Q10[ last_smple_idx ][ Winner_ind ], 10 );
            xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND(
                silk_SMULWW( psDelDec->Xq_Q14[ last_smple_idx ][ Winner_ind ], delayedGain_Q10[ last_smple_idx ] ), 8 ) );
            NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDelDec->Shape_Q14[ last_smple_idx ][ Winner_ind ];
            sLTP_Q15[          NSQ->sLTP_buf_idx     - decisionDelay ] = psDelDec->Pred_Q15[  last_smple_idx ][ Winner_ind ];
        }
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Update states */
        vst1q_s32( psDelDec->LF_AR_Q14, vld1q_s32( psSampleState[ 0 ].LF_AR_Q14 ) );
        vst1q_s32( psDelDec->Diff_Q14, vld1q_s32( psSampleState[ 0 ].Diff_Q14 ) );
        vst1q_s32( psDelDec->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        vst1q_s32( psDelDec->Xq_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].xq_Q14 ) );
        tmp1_s32x4 = vld1q_s32( psSampleState[ 0 ].Q_Q10 );
        vst1q_s32( psDelDec->Q_Q10[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->Pred_Q15[ *smpl_buf_idx ], vshlq_n_s32( vld1q_s32( psSampleState[ 0 ].LPC_exc_Q14 ), 1 ) );
        vst1q_s32( psDelDec->Shape_Q14[ *smpl_buf_idx ], vld1q_s32( psSampleState[ 0 ].sLTP_shp_Q14 ) );
        /* Mix the rounded pulses into the seed (unsigned add, matching the C version's wraparound). */
        tmp1_s32x4 = vrshrq_n_s32( tmp1_s32x4, 10 );
        tmp1_s32x4 = vreinterpretq_s32_u32( vaddq_u32( vreinterpretq_u32_s32(
            vld1q_s32( psDelDec->Seed ) ), vreinterpretq_u32_s32( tmp1_s32x4 ) ) );
        vst1q_s32( psDelDec->Seed, tmp1_s32x4 );
        vst1q_s32( psDelDec->RandState[ *smpl_buf_idx ], tmp1_s32x4 );
        vst1q_s32( psDelDec->RD_Q10, vld1q_s32( psSampleState[ 0 ].RD_Q10 ) );
        delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10;
    }
    /* Update LPC states */
    silk_memcpy( psDelDec->sLPC_Q14[ 0 ], psDelDec->sLPC_Q14[ length ], NEON_MAX_DEL_DEC_STATES * NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );

    RESTORE_STACK;
}
    924 
    925 static OPUS_INLINE void silk_SMULWB_8_neon(
    926    const opus_int16 *a,
    927    const int32x2_t  b,
    928    opus_int32       *o
    929 )
    930 {
    931    const int16x8_t a_s16x8 = vld1q_s16( a );
    932    int32x4_t o0_s32x4, o1_s32x4;
    933 
    934    o0_s32x4 = vshll_n_s16( vget_low_s16( a_s16x8 ), 15 );
    935    o1_s32x4 = vshll_n_s16( vget_high_s16( a_s16x8 ), 15 );
    936    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b, 0 );
    937    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b, 0 );
    938    vst1q_s32( o, o0_s32x4 );
    939    vst1q_s32( o + 4, o1_s32x4 );
    940 }
    941 
    942 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
    943 static OPUS_INLINE void silk_SMULWW_small_b_4_neon(
    944    opus_int32       *a,
    945    const int32x2_t  b_s32x2)
    946 {
    947    int32x4_t o_s32x4;
    948 
    949    o_s32x4 = vld1q_s32( a );
    950    o_s32x4 = vqdmulhq_lane_s32( o_s32x4, b_s32x2, 0 );
    951    vst1q_s32( a, o_s32x4 );
    952 }
    953 
    954 /* Only works when ( b >= -65536 ) && ( b < 65536 ). */
    955 static OPUS_INLINE void silk_SMULWW_small_b_8_neon(
    956    opus_int32       *a,
    957    const int32x2_t  b_s32x2
    958 )
    959 {
    960    int32x4_t o0_s32x4, o1_s32x4;
    961 
    962    o0_s32x4 = vld1q_s32( a );
    963    o1_s32x4 = vld1q_s32( a + 4 );
    964    o0_s32x4 = vqdmulhq_lane_s32( o0_s32x4, b_s32x2, 0 );
    965    o1_s32x4 = vqdmulhq_lane_s32( o1_s32x4, b_s32x2, 0 );
    966    vst1q_s32( a, o0_s32x4 );
    967    vst1q_s32( a + 4, o1_s32x4 );
    968 }
    969 
    970 static OPUS_INLINE void silk_SMULWW_4_neon(
    971    opus_int32       *a,
    972    const int32x2_t  b_s32x2)
    973 {
    974    int32x4_t a_s32x4, o_s32x4;
    975 
    976    a_s32x4 = vld1q_s32( a );
    977    o_s32x4 = vqdmulhq_lane_s32( a_s32x4, b_s32x2, 0 );
    978    o_s32x4 = vmlaq_lane_s32( o_s32x4, a_s32x4, b_s32x2, 1 );
    979    vst1q_s32( a, o_s32x4 );
    980 }
    981 
    982 static OPUS_INLINE void silk_SMULWW_8_neon(
    983    opus_int32       *a,
    984    const int32x2_t  b_s32x2
    985 )
    986 {
    987    int32x4_t a0_s32x4, a1_s32x4, o0_s32x4, o1_s32x4;
    988 
    989    a0_s32x4 = vld1q_s32( a );
    990    a1_s32x4 = vld1q_s32( a + 4 );
    991    o0_s32x4 = vqdmulhq_lane_s32( a0_s32x4, b_s32x2, 0 );
    992    o1_s32x4 = vqdmulhq_lane_s32( a1_s32x4, b_s32x2, 0 );
    993    o0_s32x4 = vmlaq_lane_s32( o0_s32x4, a0_s32x4, b_s32x2, 1 );
    994    o1_s32x4 = vmlaq_lane_s32( o1_s32x4, a1_s32x4, b_s32x2, 1 );
    995    vst1q_s32( a, o0_s32x4 );
    996    vst1q_s32( a + 4, o1_s32x4 );
    997 }
    998 
    999 static OPUS_INLINE void silk_SMULWW_loop_neon(
   1000    const opus_int16 *a,
   1001    const opus_int32 b,
   1002    opus_int32       *o,
   1003    const opus_int   loop_num
   1004 )
   1005 {
   1006    opus_int i;
   1007    int32x2_t b_s32x2;
   1008 
   1009    b_s32x2 = vdup_n_s32( b );
   1010    for( i = 0; i < loop_num - 7; i += 8 ) {
   1011        silk_SMULWB_8_neon( a + i, b_s32x2, o + i );
   1012    }
   1013    for( ; i < loop_num; i++ ) {
   1014        o[ i ] = silk_SMULWW( a[ i ], b );
   1015    }
   1016 }
   1017 
/* Scale the input with 1/Gain and, when the gain changed since the previous
 * subframe, rescale all NSQ / delayed-decision states accordingly.
 * NEON counterpart of the scalar silk_nsq_del_dec_scale_states(); the
 * delayed-decision state arrays hold NEON_MAX_DEL_DEC_STATES (4) parallel
 * states, so each scalar state becomes a 4-lane vector operation here.
 */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_neon(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_decs_struct psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16    x16[],                      /* I    Input                               */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            i, lag;
    opus_int32          gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );
    silk_SMULWW_loop_neon( x16, inv_gain_Q26, x_sc_Q10, psEncC->subfr_length );

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        /* Scale lag + LTP_ORDER/2 samples ending at the current LTP buffer index. */
        silk_SMULWW_loop_neon( sLTP + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, inv_gain_Q31, sLTP_Q15 + NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2, lag + LTP_ORDER / 2 );
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        int32x2_t gain_adj_Q16_s32x2;
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */
        /* Two code paths: when gain_adj_Q16 fits in 17 bits a single
           vqdmulh per vector suffices (the "_small_b_" helpers, with the
           adjustment pre-shifted into lane 0); otherwise the general
           helpers combine low- and high-half products from both lanes. */
        if( ( gain_adj_Q16 >= -65536 ) && ( gain_adj_Q16 < 65536 ) ) {
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16, 15 ) );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_small_b_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            /* Scalar tail for the remaining < 8 samples. */
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                /* The most recent decisionDelay samples are still pending a winner
                   decision and are scaled later, hence the - decisionDelay bound. */
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_small_b_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            /* Each "scalar" state is 4 parallel delayed-decision lanes. */
            silk_SMULWW_small_b_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_small_b_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_small_b_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_small_b_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        } else {
            /* General path: lane 0 = ( gain_adj & 0xFFFF ) << 15, lane 1 = gain_adj >> 16. */
            gain_adj_Q16_s32x2 = vdup_n_s32( silk_LSHIFT32( gain_adj_Q16 & 0x0000FFFF, 15 ) );
            gain_adj_Q16_s32x2 = vset_lane_s32( gain_adj_Q16 >> 16, gain_adj_Q16_s32x2, 1 );
            for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 7; i += 8 ) {
                silk_SMULWW_8_neon( NSQ->sLTP_shp_Q14 + i, gain_adj_Q16_s32x2 );
            }
            for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
                NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
            }

            /* Scale long-term prediction state */
            if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
                for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay - 7; i += 8 ) {
                    silk_SMULWW_8_neon( sLTP_Q15 + i, gain_adj_Q16_s32x2 );
                }
                for( ; i < NSQ->sLTP_buf_idx - decisionDelay; i++ ) {
                    sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
                }
            }

            /* Scale scalar states */
            silk_SMULWW_4_neon( psDelDec->LF_AR_Q14, gain_adj_Q16_s32x2 );
            silk_SMULWW_4_neon( psDelDec->Diff_Q14,  gain_adj_Q16_s32x2 );

            /* Scale short-term prediction and shaping states */
            for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sLPC_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
                silk_SMULWW_4_neon( psDelDec->sAR2_Q14[ i ], gain_adj_Q16_s32x2 );
            }

            for( i = 0; i < DECISION_DELAY; i++ ) {
                silk_SMULWW_4_neon( psDelDec->Pred_Q15[  i ], gain_adj_Q16_s32x2 );
                silk_SMULWW_4_neon( psDelDec->Shape_Q14[ i ], gain_adj_Q16_s32x2 );
            }
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}