tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

NSQ_sse4_1.c (36013B)


      1 /* Copyright (c) 2014-2020, Cisco Systems, INC
      2   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
      3 
      4   Redistribution and use in source and binary forms, with or without
      5   modification, are permitted provided that the following conditions
      6   are met:
      7 
      8   - Redistributions of source code must retain the above copyright
      9   notice, this list of conditions and the following disclaimer.
     10 
     11   - Redistributions in binary form must reproduce the above copyright
     12   notice, this list of conditions and the following disclaimer in the
     13   documentation and/or other materials provided with the distribution.
     14 
     15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <smmintrin.h>
     35 #include "main.h"
     36 #include "celt/x86/x86cpu.h"
     37 #include "stack_alloc.h"
     38 
/* Forward declaration of the SSE4.1 state-scaling helper used by
   silk_NSQ_sse4_1(): rescales the NSQ state buffers and the input with
   1/gain for the given subframe. (Definition is not in this chunk —
   presumably later in this file; confirm against the full source.) */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
    const opus_int16    x16[],                     /* I    input                           */
    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
    opus_int            subfr,                     /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,             /* I                                    */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
    const opus_int      signal_type                /* I    Signal type                     */
);
     52 
/* Forward declaration of the vectorized noise-shaping quantizer specialized
   for shapingLPCOrder == 10 and predictLPCOrder == 16 (the common
   configuration, dispatched from silk_NSQ_sse4_1() below). `table` points
   at the centre (index 32 row) of the caller's precomputed 64x4
   quantization-level / rate-distortion table. */
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    opus_int            signalType,             /* I    Signal type                     */
    const opus_int32    x_sc_Q10[],             /* I                                    */
    opus_int8           pulses[],               /* O                                    */
    opus_int16          xq[],                   /* O                                    */
    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    opus_int            lag,                    /* I    Pitch lag                       */
    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    opus_int32          LF_shp_Q14,             /* I                                    */
    opus_int32          Gain_Q16,               /* I                                    */
    opus_int            Lambda_Q10,             /* I                                    */
    opus_int            offset_Q10,             /* I                                    */
    opus_int            length,                 /* I    Input length                    */
    opus_int32          table[][4]              /* I                                    */
);
     73 
/* SSE4.1 entry point of the SILK Noise Shaping Quantizer (NSQ).
 *
 * Per frame: precomputes a 64x4 rate/distortion lookup table covering all
 * candidate pulse amplitudes in [-32, 31], then processes the frame one
 * subframe at a time — re-whitening the LTP state for voiced subframes,
 * scaling states/input by 1/gain, and running the noise-shaping quantizer
 * (the SSE4.1-specialized path when shaping order is 10 and prediction
 * order is 16, the generic C path otherwise).
 *
 * Must remain bit-exact with silk_NSQ_c(); when built with OPUS_CHECK_ASM
 * the C reference is run on copies of the state and the results compared.
 */
void silk_NSQ_sse4_1(
    const silk_encoder_state    *psEncC,                                      /* I    Encoder State                   */
    silk_nsq_state              *NSQ,                                         /* I/O  NSQ state                       */
    SideInfoIndices             *psIndices,                                   /* I/O  Quantization Indices            */
    const opus_int16            x16[],                                        /* I    Input                           */
    opus_int8                   pulses[],                                     /* O    Quantized pulse signal          */
    const opus_int16            *PredCoef_Q12,                                /* I    Short term prediction coefs     */
    const opus_int16            LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],      /* I    Long term prediction coefs      */
    const opus_int16            AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I    Noise shaping coefs             */
    const opus_int              HarmShapeGain_Q14[ MAX_NB_SUBFR ],            /* I    Long term shaping coefs         */
    const opus_int              Tilt_Q14[ MAX_NB_SUBFR ],                     /* I    Spectral tilt                   */
    const opus_int32            LF_shp_Q14[ MAX_NB_SUBFR ],                   /* I    Low frequency shaping coefs     */
    const opus_int32            Gains_Q16[ MAX_NB_SUBFR ],                    /* I    Quantization step sizes         */
    const opus_int              pitchL[ MAX_NB_SUBFR ],                       /* I    Pitch lags                      */
    const opus_int              Lambda_Q10,                                   /* I    Rate/distortion tradeoff        */
    const opus_int              LTP_scale_Q14                                 /* I    LTP state scaling               */
)
{
    opus_int            k, lag, start_idx, LSF_interpolation_flag;
    const opus_int16    *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16          *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32          HarmShapeFIRPacked_Q14;
    opus_int            offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );

    /* Row [32 + q1_Q0] holds, for candidate level q1_Q0 in [-32, 31]:
       [0] q1_Q10 (lower candidate), [1] q2_Q10 (upper candidate),
       [2] 2*(q1_Q10 - q2_Q10), [3] (rd1_Q20 - rd2_Q20) + (q1_Q10^2 - q2_Q10^2).
       Columns [2]/[3] let the inner quantizer choose between the two
       neighboring levels with a single multiply-and-compare. */
    opus_int32   table[ 64 ][ 4 ];
    opus_int32   tmp1;
    opus_int32   q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

#ifdef OPUS_CHECK_ASM
    silk_nsq_state NSQ_c;
    SideInfoIndices psIndices_c;
    opus_int8 pulses_c[ MAX_FRAME_LENGTH ];
    const opus_int8 *const pulses_a = pulses;
#endif

    SAVE_STACK;

#ifdef OPUS_CHECK_ASM
    /* Bit-exactness check: run the C reference on copies of all in/out
       state, then memcmp against the SIMD results at the end. */
    ( void )pulses_a;
    silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) );
    silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) );
    silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH );
    silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) );

    silk_NSQ_c(
        psEncC,
        &NSQ_c,
        &psIndices_c,
        x16,
        pulses_c,
        PredCoef_Q12,
        LTPCoef_Q14,
        AR_Q13,
        HarmShapeGain_Q14,
        Tilt_Q14,
        LF_shp_Q14,
        Gains_Q16,
        pitchL,
        Lambda_Q10,
        LTP_scale_Q14
    );
#endif

    NSQ->rand_seed = psIndices->Seed;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    /* Quantization offset depends on signal type (voiced/unvoiced) and offset type */
    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

    /* Build the R/D table. Rate term is |q|*Lambda (note the sign flips on
       negative levels below); QUANT_LEVEL_ADJUST_Q10 pulls non-zero levels
       slightly toward zero, matching the reference quantizer. */

    /* 0 */
    q1_Q10  = offset_Q10;
    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    rd1_Q20 = q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 32 ][ 0 ] = q1_Q10;
    table[ 32 ][ 1 ] = q2_Q10;
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* -1 */
    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    q2_Q10  = offset_Q10;
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 31 ][ 0 ] = q1_Q10;
    table[ 31 ][ 1 ] = q2_Q10;
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* > 0 */
    for (k = 1; k <= 31; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = q1_Q10 * Lambda_Q10;
        rd2_Q20 = q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* < -1 */
    for (k = -32; k <= -2; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
        /* Negative levels: rate is |q|*Lambda, hence the negated products */
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
        rd2_Q20 = - q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* NLSFInterpCoef_Q2 == 4 means "no interpolation" (use end-of-frame LPCs
       for the whole frame) */
    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    /* Set up pointers to start of sub frame */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        /* With interpolation, the first half of the frame (k>>1 == 0) uses the
           interpolated coefficient set; otherwise always the second set */
        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        /* Pack the two symmetric harmonic-shaping FIR taps into one 32-bit
           word: gain/4 in the low half, gain/2 in the high half */
        HarmShapeFIRPacked_Q14  =                          silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening: mask is 3 without interpolation (subframe 0 only)
               or 1 with interpolation (every even subframe, i.e. whenever the
               A coefficients change) */
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                celt_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        /* Dispatch: SIMD-specialized quantizer for the common 10/16 order
           configuration, generic C quantizer otherwise */
        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) )
        {
            /* &(table[32]) passes the table centre so the callee can index it
               directly with the signed quantization level */
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch );
        }

        /* Advance input/output pointers to the next subframe */
        x16    += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* Update lagPrev for next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech and noise shaping signals */
    silk_memmove( NSQ->xq,           &NSQ->xq[           psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );

#ifdef OPUS_CHECK_ASM
    /* SIMD path must match the C reference bit-exactly */
    silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) );
    silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) );
    silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) );
#endif

    RESTORE_STACK;
}
    280 
    281 /************************************/
    282 /* silk_noise_shape_quantizer_10_16 */
    283 /************************************/
    284 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    285    silk_nsq_state      *NSQ,                   /* I/O  NSQ state                       */
    286    opus_int            signalType,             /* I    Signal type                     */
    287    const opus_int32    x_sc_Q10[],             /* I                                    */
    288    opus_int8           pulses[],               /* O                                    */
    289    opus_int16          xq[],                   /* O                                    */
    290    opus_int32          sLTP_Q15[],             /* I/O  LTP state                       */
    291    const opus_int16    a_Q12[],                /* I    Short term prediction coefs     */
    292    const opus_int16    b_Q14[],                /* I    Long term prediction coefs      */
    293    const opus_int16    AR_shp_Q13[],           /* I    Noise shaping AR coefs          */
    294    opus_int            lag,                    /* I    Pitch lag                       */
    295    opus_int32          HarmShapeFIRPacked_Q14, /* I                                    */
    296    opus_int            Tilt_Q14,               /* I    Spectral tilt                   */
    297    opus_int32          LF_shp_Q14,             /* I                                    */
    298    opus_int32          Gain_Q16,               /* I                                    */
    299    opus_int            Lambda_Q10,             /* I                                    */
    300    opus_int            offset_Q10,             /* I                                    */
    301    opus_int            length,                 /* I    Input length                    */
    302    opus_int32          table[][4]              /* I                                    */
    303 )
    304 {
    305    opus_int     i;
    306    opus_int32   LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
    307    opus_int32   n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
    308    opus_int32   exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14;
    309    opus_int32   tmp1, tmp2, sLF_AR_shp_Q14;
    310    opus_int32   *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;
    311 
    312    __m128i xmm_tempa, xmm_tempb;
    313 
    314    __m128i xmm_one;
    315 
    316    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
    317    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
    318    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;
    319 
    320    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
    321    __m128i AR_shp_Q13_76543210;
    322 
    323    int rdo_offset = (Lambda_Q10 >> 1) - 512;
    324 
    325    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    326    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    327    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );
    328 
    329    /* Set up short term AR state */
    330    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];
    331 
    332    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
    333    xq_Q14         = psLPC_Q14[ 0 ];
    334    sDiff_shp_Q14  = NSQ->sDiff_shp_Q14;
    335    LTP_pred_Q13   = 0;
    336 
    337    /* load a_Q12 */
    338    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
    339 
    340    /* load a_Q12[0] - a_Q12[7] */
    341    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) );
    342    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
    343    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) );
    344 
    345    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
    346    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );
    347 
    348    /* load AR_shp_Q13 */
    349    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) );
    350 
    351    /* load psLPC_Q14 */
    352    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
    353 
    354    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) );
    355    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) );
    356 
    357    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    358    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
    359 
    360    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    361    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
    362 
    363    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) );
    364    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) );
    365 
    366    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    367    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
    368 
    369    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    370    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
    371 
    372    /* load sAR2_Q14 */
    373    xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) );
    374    xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) );
    375 
    376    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    377    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );
    378 
    379    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    380    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );
    381 
    382    /* prepare 1 in 8 * 16bit */
    383    xmm_one = _mm_set1_epi16(1);
    384 
    385    for( i = 0; i < length; i++ )
    386    {
    387        /* Short-term prediction */
    388        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;
    389 
    390        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    391        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */
    392 
    393        /* shift psLPC_Q14 */
    394        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
    395        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );
    396 
    397        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
    398        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );
    399 
    400        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
    401        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14),       7 );
    402 
    403        /* high part, use pmaddwd, results in 4 32-bit */
    404        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
    405        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );
    406 
    407        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */
    408        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
    409        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );
    410 
    411        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
    412        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );
    413 
    414        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
    415        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );
    416 
    417        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
    418        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );
    419 
    420        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
    421        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );
    422 
    423        /* accumulate */
    424        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
    425        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );
    426 
    427        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
    428 
    429        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
    430        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
    431 
    432        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );
    433 
    434        /* Long-term prediction */
    435        if ( opus_likely( signalType == TYPE_VOICED ) ) {
    436            /* Unrolled loop */
    437            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
    438            LTP_pred_Q13 = 2;
    439            {
    440                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;
    441 
    442                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
    443                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );
    444 
    445                /* loaded: [0] [-1] [-2] [-3] */
    446                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) );
    447                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
    448                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
    449                /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */
    450                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
    451                /* right shift 2 bytes (16 bits), zero extended */
    452                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );
    453 
    454                /* a[1] * b[-1], a[3] * b[-3] */
    455                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
    456                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );
    457 
    458                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
    459                /* equal shift right 8 bytes*/
    460                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
    461                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );
    462 
    463                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );
    464 
    465                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
    466                pred_lag_ptr++;
    467            }
    468        }
    469 
    470        /* Noise shape feedback */
    471        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
    472        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );
    473 
    474        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
    475        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );
    476 
    477        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 );
    478        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14),       0 );
    479 
    480        /* high part, use pmaddwd, results in 4 32-bit */
    481        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );
    482 
    483        /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */
    484        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
    485        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );
    486 
    487        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
    488        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
    489 
    490        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
    491 
    492        /* accumulate */
    493        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );
    494 
    495        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) );
    496        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) );
    497 
    498        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );
    499 
    500        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
    501        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );
    502 
    503        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
    504        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );
    505 
    506        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
    507        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );
    508 
    509        celt_assert( lag > 0 || signalType != TYPE_VOICED );
    510 
    511        /* Combine prediction and noise shaping signals */
    512        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
    513        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
    514        if( lag > 0 ) {
    515            /* Symmetric, packed FIR coefficients */
    516            n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
    517            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ],                      HarmShapeFIRPacked_Q14 );
    518            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
    519            shp_lag_ptr++;
    520 
    521            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
    522            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
    523            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
    524        } else {
    525            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
    526        }
    527 
    528        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */
    529 
    530        /* Generate dither */
    531        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );
    532 
    533        /* Flip sign depending on dither */
    534        tmp2 = -r_Q10;
    535        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;
    536 
    537        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );
    538 
    539        /* Find two quantization level candidates and measure their rate-distortion */
    540        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
    541        q1_Q0 = silk_RSHIFT( q1_Q10, 10 );
    542        if (Lambda_Q10 > 2048) {
    543            /* For aggressive RDO, the bias becomes more than one pulse. */
    544            if (q1_Q10 > rdo_offset) {
    545                q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 );
    546            } else if (q1_Q10 < -rdo_offset) {
    547                q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 );
    548            } else if (q1_Q10 < 0) {
    549                q1_Q0 = -1;
    550            } else {
    551                q1_Q0 = 0;
    552            }
    553        }
    554 
    555        q1_Q10 = table[q1_Q0][0];
    556        q2_Q10 = table[q1_Q0][1];
    557 
    558        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
    559        {
    560            q1_Q10 = q2_Q10;
    561        }
    562 
    563        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );
    564 
    565        /* Excitation */
    566        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );
    567 
    568        tmp2 = -exc_Q14;
    569        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;
    570 
    571        /* Add predictions */
    572        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
    573        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );
    574 
    575        /* Update states */
    576        psLPC_Q14++;
    577        *psLPC_Q14 = xq_Q14;
    578        NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 );
    579        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 );
    580 
    581        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
    582        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
    583        NSQ->sLTP_shp_buf_idx++;
    584        NSQ->sLTP_buf_idx++;
    585 
    586        /* Make dither dependent on quantized signal */
    587        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
    588    }
    589 
    590    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;
    591 
    592    /* Scale XQ back to normal level before saving */
    593    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];
    594 
    595    /* write back sAR2_Q14 */
    596    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    597    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    598    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
    599    _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );
    600 
    601    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
    602    {
    603        __m128i xmm_Gain_Q10;
    604        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;
    605 
    606        /* prepare (1 << 7) in packed 4 32-bits */
    607        xmm_tempa = _mm_set1_epi32( (1 << 7) );
    608 
    609        /* prepare Gain_Q10 in packed 4 32-bits */
    610        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );
    611 
    612        /* process xq */
    613        for (i = 0; i < length - 7; i += 8)
    614        {
    615            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) );
    616            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) );
    617 
    618            /* equal shift right 4 bytes*/
    619            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    620            /* equal shift right 4 bytes*/
    621            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    622 
    623            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
    624            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
    625            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
    626            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );
    627 
    628            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
    629            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
    630            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
    631            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );
    632 
    633            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
    634            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );
    635 
    636            /* silk_RSHIFT_ROUND(xq, 8) */
    637            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
    638            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );
    639 
    640            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
    641            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );
    642 
    643            /* silk_SAT16 */
    644            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );
    645 
    646            /* save to xq */
    647            _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 );
    648        }
    649    }
    650    for ( ; i < length; i++)
    651    {
    652        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
    653    }
    654 
    655    /* Update LPC synth buffer */
    656    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
    657 }
    658 
/* Scale the subframe input and the NSQ state buffers by the (inverse)
 * quantization gain so that the noise shaping quantizer operates at a
 * normalized level.  SSE4.1 version: the Q16 fixed-point multiplies
 * (silk_SMULWW, i.e. (a*b) >> 16 with a 32x32->64-bit intermediate) are
 * vectorized four 32-bit samples at a time; scalar tail loops handle the
 * remaining (length % 4) elements with the reference macros, keeping the
 * result bit-exact with the C implementation.
 */
static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,              /* I    Encoder State                   */
    silk_nsq_state      *NSQ,                      /* I/O  NSQ state                       */
    const opus_int16    x16[],                     /* I    input                           */
    opus_int32          x_sc_Q10[],                /* O    input scaled with 1/Gain        */
    const opus_int16    sLTP[],                    /* I    re-whitened LTP state in Q0     */
    opus_int32          sLTP_Q15[],                /* O    LTP state matching scaled input */
    opus_int            subfr,                     /* I    subframe number                 */
    const opus_int      LTP_scale_Q14,             /* I                                    */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ], /* I                                    */
    const opus_int      pitchL[ MAX_NB_SUBFR ],    /* I    Pitch lag                       */
    const opus_int      signal_type                /* I    Signal type                     */
)
{
    opus_int   i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1;

    lag          = pitchL[ subfr ];
    /* Inverse of the subframe gain in Q47; the gain is clamped to >= 1 so the
       inversion cannot divide by zero. */
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Scale input */
    inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 );

    /* prepare inv_gain_Q26 in packed 4 32-bits */
    xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26);

    /* Vectorized silk_SMULWW( x16[ i ], inv_gain_Q26 ):
       _mm_mul_epi32 only multiplies the even 32-bit lanes (producing 64-bit
       products), so the odd lanes are handled via a copy shuffled right by one
       dword; the >>16 of SMULWW is applied with 64-bit shifts (left shift for
       the odd-lane copy so the result lands in the high dword), and the two
       halves are re-interleaved with a 16-bit blend (mask 0xCC selects the odd
       dwords from the second operand). */
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        /* sign-extend 4 input samples from 16 to 32 bits */
        xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) );

        /* equal shift right 4 bytes*/
        xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 );
        xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 );

        xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 );
        xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 );

        xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 );
    }

    /* scalar tail for the remaining (subfr_length % 4) samples */
    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 );
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        /* Scale the portion of the LTP state covering the pitch lag plus half
           the LTP filter order. */
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;
        /* ratio of previous to current gain, in Q16 */
        gain_adj_Q16 =  silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

        /* Scale long-term shaping state */

        /* prepare gain_adj_Q16 in packed 4 32-bits */
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);

        /* same vectorized SMULWW pattern as the input-scaling loop above,
           applied in-place to the long-term shaping buffer */
        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
        {
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
            /* equal shift right 4 bytes*/
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

            _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
        }

        /* scalar tail for the remaining elements */
        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
        }

        /* Scale long-term prediction state */
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
            }
        }

        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );
        NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 );

        /* Scale short-term prediction and shaping states */
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
        }
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
    }
}