NSQ_sse4_1.c (36013B)
1 /* Copyright (c) 2014-2020, Cisco Systems, INC 2 Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include <xmmintrin.h> 33 #include <emmintrin.h> 34 #include <smmintrin.h> 35 #include "main.h" 36 #include "celt/x86/x86cpu.h" 37 #include "stack_alloc.h" 38 39 static OPUS_INLINE void silk_nsq_scale_states_sse4_1( 40 const silk_encoder_state *psEncC, /* I Encoder State */ 41 silk_nsq_state *NSQ, /* I/O NSQ state */ 42 const opus_int16 x16[], /* I input */ 43 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ 44 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ 45 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 46 opus_int subfr, /* I subframe number */ 47 const opus_int LTP_scale_Q14, /* I */ 48 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 49 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 50 const opus_int signal_type /* I Signal type */ 51 ); 52 53 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( 54 silk_nsq_state *NSQ, /* I/O NSQ state */ 55 opus_int signalType, /* I Signal type */ 56 const opus_int32 x_sc_Q10[], /* I */ 57 opus_int8 pulses[], /* O */ 58 opus_int16 xq[], /* O */ 59 opus_int32 sLTP_Q15[], /* I/O LTP state */ 60 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 61 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 62 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ 63 opus_int lag, /* I Pitch lag */ 64 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 65 opus_int Tilt_Q14, /* I Spectral tilt */ 66 opus_int32 LF_shp_Q14, /* I */ 67 opus_int32 Gain_Q16, /* I */ 68 opus_int Lambda_Q10, /* I */ 69 opus_int offset_Q10, /* I */ 70 opus_int length, /* I Input length */ 71 opus_int32 table[][4] /* I */ 72 ); 73 74 void silk_NSQ_sse4_1( 75 const silk_encoder_state *psEncC, /* I Encoder State */ 76 silk_nsq_state *NSQ, /* I/O NSQ state */ 77 SideInfoIndices *psIndices, /* I/O Quantization Indices */ 78 const opus_int16 x16[], /* I Input */ 79 opus_int8 pulses[], /* O Quantized pulse signal */ 80 const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ 81 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ 82 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ 83 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I Long term shaping coefs */ 84 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ 85 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ 86 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ 87 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ 88 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ 89 const opus_int LTP_scale_Q14 /* I LTP state scaling */ 90 ) 91 { 92 opus_int k, lag, start_idx, LSF_interpolation_flag; 93 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; 94 opus_int16 *pxq; 95 VARDECL( opus_int32, sLTP_Q15 ); 96 VARDECL( opus_int16, sLTP ); 97 opus_int32 HarmShapeFIRPacked_Q14; 98 opus_int offset_Q10; 99 VARDECL( opus_int32, x_sc_Q10 ); 100 101 opus_int32 table[ 64 ][ 4 ]; 102 opus_int32 tmp1; 103 opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20; 104 105 #ifdef OPUS_CHECK_ASM 106 silk_nsq_state NSQ_c; 107 SideInfoIndices psIndices_c; 108 opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; 109 const opus_int8 *const pulses_a = pulses; 110 #endif 111 112 SAVE_STACK; 113 114 #ifdef OPUS_CHECK_ASM 115 ( void )pulses_a; 116 silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); 117 silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); 118 silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); 119 silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); 120 121 silk_NSQ_c( 122 psEncC, 123 &NSQ_c, 124 &psIndices_c, 125 x16, 126 pulses_c, 127 PredCoef_Q12, 128 LTPCoef_Q14, 129 AR_Q13, 130 HarmShapeGain_Q14, 131 Tilt_Q14, 132 LF_shp_Q14, 133 Gains_Q16, 134 pitchL, 135 Lambda_Q10, 136 LTP_scale_Q14 137 ); 138 #endif 139 140 NSQ->rand_seed = psIndices->Seed; 141 142 /* Set unvoiced lag to the previous one, overwrite later for voiced */ 143 lag = NSQ->lagPrev; 144 145 silk_assert( NSQ->prev_gain_Q16 != 0 ); 146 147 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; 148 149 /* 0 */ 150 q1_Q10 = offset_Q10; 151 q2_Q10 = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); 152 rd1_Q20 = q1_Q10 * Lambda_Q10; 153 rd2_Q20 = q2_Q10 * Lambda_Q10; 154 155 table[ 32 ][ 0 ] = q1_Q10; 156 table[ 32 ][ 1 ] = q2_Q10; 157 table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 158 table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 159 160 /* -1 */ 161 q1_Q10 = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ); 162 q2_Q10 = offset_Q10; 163 rd1_Q20 = - q1_Q10 * Lambda_Q10; 164 rd2_Q20 = q2_Q10 * Lambda_Q10; 165 166 table[ 31 ][ 0 ] = q1_Q10; 167 table[ 31 ][ 1 ] = q2_Q10; 168 table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 169 table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 170 171 /* > 0 */ 172 for (k = 1; k <= 31; k++) 173 { 174 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); 175 176 q1_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10; 177 q2_Q10 = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024; 178 rd1_Q20 = q1_Q10 * Lambda_Q10; 179 rd2_Q20 = q2_Q10 * Lambda_Q10; 180 181 table[ 32 + k ][ 0 ] = q1_Q10; 182 table[ 32 + k ][ 1 ] = q2_Q10; 183 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 184 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 185 } 186 187 /* < -1 */ 188 for (k = -32; k <= -2; k++) 189 { 190 tmp1 = offset_Q10 + silk_LSHIFT( k, 10 ); 191 192 q1_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10; 193 q2_Q10 = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024; 194 rd1_Q20 = - q1_Q10 * Lambda_Q10; 195 rd2_Q20 = - q2_Q10 * Lambda_Q10; 196 197 table[ 32 + k ][ 0 ] = q1_Q10; 198 table[ 32 + k ][ 1 ] = q2_Q10; 199 table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10); 200 table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10); 201 } 202 203 if( psIndices->NLSFInterpCoef_Q2 == 4 ) { 204 LSF_interpolation_flag = 0; 205 } else { 206 LSF_interpolation_flag = 1; 207 } 208 209 ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); 210 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); 211 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); 212 /* Set up pointers to start of sub frame */ 213 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; 214 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 215 pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; 216 for( k = 0; k < psEncC->nb_subfr; k++ ) { 217 A_Q12 = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ]; 218 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; 219 AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; 220 221 /* Noise shape parameters */ 222 silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); 223 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); 224 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); 225 226 NSQ->rewhite_flag = 0; 227 if( psIndices->signalType == TYPE_VOICED ) { 228 /* Voiced */ 229 lag = pitchL[ k ]; 230 231 /* Re-whitening */ 232 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { 233 /* Rewhiten with new A coefs */ 234 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; 235 celt_assert( start_idx > 0 ); 236 237 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], 238 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); 239 240 NSQ->rewhite_flag = 1; 241 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 242 } 243 } 244 245 silk_nsq_scale_states_sse4_1( psEncC, NSQ, x16, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType ); 246 247 if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder) ) ) 248 { 249 silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, 250 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, 251 offset_Q10, psEncC->subfr_length, &(table[32]) ); 252 } 253 else 254 { 255 silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14, 256 AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10, 257 offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder, psEncC->arch ); 258 } 259 260 x16 += psEncC->subfr_length; 261 pulses += psEncC->subfr_length; 262 pxq += psEncC->subfr_length; 263 } 264 265 /* Update lagPrev for next frame */ 266 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; 267 268 /* Save quantized speech and noise shaping signals */ 269 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); 270 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); 271 272 #ifdef OPUS_CHECK_ASM 273 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); 274 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); 275 silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); 276 #endif 277 278 RESTORE_STACK; 279 } 280 281 /************************************/ 282 /* silk_noise_shape_quantizer_10_16 */ 283 /************************************/ 284 static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1( 285 silk_nsq_state *NSQ, /* I/O NSQ state */ 286 opus_int signalType, /* I Signal type */ 287 const opus_int32 x_sc_Q10[], /* I */ 288 opus_int8 pulses[], /* O */ 289 opus_int16 xq[], /* O */ 290 opus_int32 sLTP_Q15[], /* I/O LTP state */ 291 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 292 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 293 const opus_int16 AR_shp_Q13[], /* I Noise shaping AR coefs */ 294 opus_int lag, /* I Pitch lag */ 295 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 296 opus_int Tilt_Q14, /* I Spectral tilt */ 297 opus_int32 LF_shp_Q14, /* I */ 298 opus_int32 Gain_Q16, /* I */ 299 opus_int Lambda_Q10, /* I */ 300 opus_int offset_Q10, /* I */ 301 opus_int length, /* I Input length */ 302 opus_int32 table[][4] /* I */ 303 ) 304 { 305 opus_int i; 306 opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13; 307 opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10; 308 opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10, sDiff_shp_Q14; 309 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; 310 opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr; 311 312 __m128i xmm_tempa, xmm_tempb; 313 314 __m128i xmm_one; 315 316 __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF; 317 __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF; 318 __m128i a_Q12_01234567, a_Q12_89ABCDEF; 319 320 __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210; 321 __m128i AR_shp_Q13_76543210; 322 323 int rdo_offset = (Lambda_Q10 >> 1) - 512; 324 325 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; 326 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; 327 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); 328 329 /* Set up short term AR state */ 330 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ]; 331 332 sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14; 333 xq_Q14 = psLPC_Q14[ 0 ]; 334 sDiff_shp_Q14 = NSQ->sDiff_shp_Q14; 335 LTP_pred_Q13 = 0; 336 337 /* load a_Q12 */ 338 xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 ); 339 340 /* load a_Q12[0] - a_Q12[7] */ 341 a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 0 ] ) ); 342 /* load a_Q12[ 8 ] - a_Q12[ 15 ] */ 343 a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(void*)(&a_Q12[ 8 ] ) ); 344 345 a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one ); 346 a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one ); 347 348 /* load AR_shp_Q13 */ 349 AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(void*)(&AR_shp_Q13[0] ) ); 350 351 /* load psLPC_Q14 */ 352 xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 ); 353 354 xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-16]) ); 355 xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[-12]) ); 356 357 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 358 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 359 360 psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 361 psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 362 363 xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -8 ]) ); 364 xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -4 ]) ); 365 366 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 367 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 368 369 psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 370 psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 371 372 /* load sAR2_Q14 */ 373 xmm_tempa = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 0 ]) ) ); 374 xmm_tempb = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sAR2_Q14[ 4 ]) ) ); 375 376 xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one ); 377 xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one ); 378 379 sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb ); 380 sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb ); 381 382 /* prepare 1 in 8 * 16bit */ 383 xmm_one = _mm_set1_epi16(1); 384 385 for( i = 0; i < length; i++ ) 386 { 387 /* Short-term prediction */ 388 __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F; 389 390 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 391 LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */ 392 393 /* shift psLPC_Q14 */ 394 psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 ); 395 psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 ); 396 397 psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 ); 398 psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 ); 399 400 psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 ); 401 psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 ); 402 403 /* high part, use pmaddwd, results in 4 32-bit */ 404 xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 ); 405 xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF ); 406 407 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed, _mm_srai_epi16(psLPC_Q14_lo_01234567, 15) */ 408 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 ); 409 xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF ); 410 411 xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 ); 412 xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF ); 413 414 xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 ); 415 xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF ); 416 417 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); 418 xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb ); 419 420 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); 421 xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one ); 422 423 /* accumulate */ 424 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F ); 425 xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F ); 426 427 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); 428 429 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); 430 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); 431 432 LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 ); 433 434 /* Long-term prediction */ 435 if ( opus_likely( signalType == TYPE_VOICED ) ) { 436 /* Unrolled loop */ 437 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 438 LTP_pred_Q13 = 2; 439 { 440 __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123; 441 442 b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 ); 443 b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B ); 444 445 /* loaded: [0] [-1] [-2] [-3] */ 446 pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); 447 /* shuffle to [-3] [-2] [-1] [0] and to new xmm */ 448 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B ); 449 /*64-bit multiply, a[2] * b[-2], a[0] * b[0] */ 450 xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 ); 451 /* right shift 2 bytes (16 bits), zero extended */ 452 xmm_tempa = _mm_srli_si128( xmm_tempa, 2 ); 453 454 /* a[1] * b[-1], a[3] * b[-3] */ 455 pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 ); 456 pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 ); 457 458 pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa ); 459 /* equal shift right 8 bytes*/ 460 xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) ); 461 xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 ); 462 463 LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa ); 464 465 LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); 466 pred_lag_ptr++; 467 } 468 } 469 470 /* Noise shape feedback */ 471 NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ]; 472 NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128(_mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) ); 473 474 sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 ); 475 sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 ); 476 477 sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (sDiff_shp_Q14 >> 16), 0 ); 478 sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (sDiff_shp_Q14), 0 ); 479 480 /* high part, use pmaddwd, results in 4 32-bit */ 481 xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 ); 482 483 /* low part, use pmulhw, results in 8 16-bit, note we need simulate unsigned * signed,_mm_srai_epi16(sAR2_Q14_lo_76543210, 15) */ 484 xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 ); 485 xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 ); 486 487 xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 ); 488 xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa ); 489 490 xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one ); 491 492 /* accumulate */ 493 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 ); 494 495 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64(xmm_hi_07, xmm_hi_07 ) ); 496 xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16(xmm_hi_07, 0x0E ) ); 497 498 n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 ); 499 500 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] ); 501 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] ); 502 503 n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 ); /* Q11 -> Q12 */ 504 n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 ); 505 506 n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 ); 507 n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 ); 508 509 celt_assert( lag > 0 || signalType != TYPE_VOICED ); 510 511 /* Combine prediction and noise shaping signals */ 512 tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 ); /* Q12 */ 513 tmp1 = silk_SUB32( tmp1, n_LF_Q12 ); /* Q12 */ 514 if( lag > 0 ) { 515 /* Symmetric, packed FIR coefficients */ 516 n_LTP_Q13 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); 517 n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); 518 n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 ); 519 shp_lag_ptr++; 520 521 tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 ); /* Q13 */ 522 tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 ); /* Q13 */ 523 tmp1 = silk_RSHIFT_ROUND( tmp1, 3 ); /* Q10 */ 524 } else { 525 tmp1 = silk_RSHIFT_ROUND( tmp1, 2 ); /* Q10 */ 526 } 527 528 r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 ); /* residual error Q10 */ 529 530 /* Generate dither */ 531 NSQ->rand_seed = silk_RAND( NSQ->rand_seed ); 532 533 /* Flip sign depending on dither */ 534 tmp2 = -r_Q10; 535 if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2; 536 537 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); 538 539 /* Find two quantization level candidates and measure their rate-distortion */ 540 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); 541 q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); 542 if (Lambda_Q10 > 2048) { 543 /* For aggressive RDO, the bias becomes more than one pulse. */ 544 if (q1_Q10 > rdo_offset) { 545 q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); 546 } else if (q1_Q10 < -rdo_offset) { 547 q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); 548 } else if (q1_Q10 < 0) { 549 q1_Q0 = -1; 550 } else { 551 q1_Q0 = 0; 552 } 553 } 554 555 q1_Q10 = table[q1_Q0][0]; 556 q2_Q10 = table[q1_Q0][1]; 557 558 if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0) 559 { 560 q1_Q10 = q2_Q10; 561 } 562 563 pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 ); 564 565 /* Excitation */ 566 exc_Q14 = silk_LSHIFT( q1_Q10, 4 ); 567 568 tmp2 = -exc_Q14; 569 if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2; 570 571 /* Add predictions */ 572 LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 ); 573 xq_Q14 = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 ); 574 575 /* Update states */ 576 psLPC_Q14++; 577 *psLPC_Q14 = xq_Q14; 578 NSQ->sDiff_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, x_sc_Q10[ i ], 4 ); 579 sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( NSQ->sDiff_shp_Q14, n_AR_Q12, 2 ); 580 581 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 ); 582 sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 ); 583 NSQ->sLTP_shp_buf_idx++; 584 NSQ->sLTP_buf_idx++; 585 586 /* Make dither dependent on quantized signal */ 587 NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] ); 588 } 589 590 NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14; 591 592 /* Scale XQ back to normal level before saving */ 593 psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ]; 594 595 /* write back sAR2_Q14 */ 596 xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); 597 xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ); 598 _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa ); 599 _mm_storeu_si128( (__m128i *)(void*)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb ); 600 601 /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */ 602 { 603 __m128i xmm_Gain_Q10; 604 __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5; 605 606 /* prepare (1 << 7) in packed 4 32-bits */ 607 xmm_tempa = _mm_set1_epi32( (1 << 7) ); 608 609 /* prepare Gain_Q10 in packed 4 32-bits */ 610 xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 ); 611 612 /* process xq */ 613 for (i = 0; i < length - 7; i += 8) 614 { 615 xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 0 ] ) ) ); 616 xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(void*)(&(psLPC_Q14[ i + 4 ] ) ) ); 617 618 /* equal shift right 4 bytes*/ 619 xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 620 /* equal shift right 4 bytes*/ 621 xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 622 623 xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 ); 624 xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 ); 625 xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 ); 626 xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 ); 627 628 xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 ); 629 xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 ); 630 xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 ); 631 xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 ); 632 633 xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC ); 634 xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC ); 635 636 /* silk_RSHIFT_ROUND(xq, 8) */ 637 xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa ); 638 xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa ); 639 640 xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 ); 641 xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 ); 642 643 /* silk_SAT16 */ 644 xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 ); 645 646 /* save to xq */ 647 _mm_storeu_si128( (__m128i *)(void*)(&xq[ i ] ), xmm_xq_Q14_3210 ); 648 } 649 } 650 for ( ; i < length; i++) 651 { 652 xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); 653 } 654 655 /* Update LPC synth buffer */ 656 silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 657 } 658 659 static OPUS_INLINE void silk_nsq_scale_states_sse4_1( 660 const silk_encoder_state *psEncC, /* I Encoder State */ 661 silk_nsq_state *NSQ, /* I/O NSQ state */ 662 const opus_int16 x16[], /* I input */ 663 opus_int32 x_sc_Q10[], /* O input scaled with 1/Gain */ 664 const opus_int16 sLTP[], /* I re-whitened LTP state in Q0 */ 665 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 666 opus_int subfr, /* I subframe number */ 667 const opus_int LTP_scale_Q14, /* I */ 668 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 669 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 670 const opus_int signal_type /* I Signal type */ 671 ) 672 { 673 opus_int i, lag; 674 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26; 675 __m128i xmm_inv_gain_Q26, xmm_x16_x2x0, xmm_x16_x3x1; 676 677 lag = pitchL[ subfr ]; 678 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); 679 silk_assert( inv_gain_Q31 != 0 ); 680 681 /* Scale input */ 682 inv_gain_Q26 = silk_RSHIFT_ROUND( inv_gain_Q31, 5 ); 683 684 /* prepare inv_gain_Q26 in packed 4 32-bits */ 685 xmm_inv_gain_Q26 = _mm_set1_epi32(inv_gain_Q26); 686 687 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { 688 xmm_x16_x2x0 = OP_CVTEPI16_EPI32_M64( &(x16[ i ] ) ); 689 690 /* equal shift right 4 bytes*/ 691 xmm_x16_x3x1 = _mm_shuffle_epi32( xmm_x16_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 692 693 xmm_x16_x2x0 = _mm_mul_epi32( xmm_x16_x2x0, xmm_inv_gain_Q26 ); 694 xmm_x16_x3x1 = _mm_mul_epi32( xmm_x16_x3x1, xmm_inv_gain_Q26 ); 695 696 xmm_x16_x2x0 = _mm_srli_epi64( xmm_x16_x2x0, 16 ); 697 xmm_x16_x3x1 = _mm_slli_epi64( xmm_x16_x3x1, 16 ); 698 699 xmm_x16_x2x0 = _mm_blend_epi16( xmm_x16_x2x0, xmm_x16_x3x1, 0xCC ); 700 701 _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ i ] ) ), xmm_x16_x2x0 ); 702 } 703 704 for( ; i < psEncC->subfr_length; i++ ) { 705 x_sc_Q10[ i ] = silk_SMULWW( x16[ i ], inv_gain_Q26 ); 706 } 707 708 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */ 709 if( NSQ->rewhite_flag ) { 710 if( subfr == 0 ) { 711 /* Do LTP downscaling */ 712 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 ); 713 } 714 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { 715 silk_assert( i < MAX_FRAME_LENGTH ); 716 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); 717 } 718 } 719 720 /* Adjust for changing gain */ 721 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { 722 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1; 723 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 ); 724 725 /* Scale long-term shaping state */ 726 727 /* prepare gain_adj_Q16 in packed 4 32-bits */ 728 xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16); 729 730 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 ) 731 { 732 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ) ); 733 /* equal shift right 4 bytes*/ 734 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 735 736 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 ); 737 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 ); 738 739 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 ); 740 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 ); 741 742 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC ); 743 744 _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 ); 745 } 746 747 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { 748 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] ); 749 } 750 751 /* Scale long-term prediction state */ 752 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { 753 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) { 754 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); 755 } 756 } 757 758 NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 ); 759 NSQ->sDiff_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sDiff_shp_Q14 ); 760 761 /* Scale short-term prediction and shaping states */ 762 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { 763 NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] ); 764 } 765 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { 766 NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] ); 767 } 768 769 /* Save inverse gain */ 770 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; 771 } 772 }