NSQ_del_dec_sse4_1.c (48765B)
1 /* Copyright (c) 2014-2020, Cisco Systems, INC 2 Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include <xmmintrin.h> 33 #include <emmintrin.h> 34 #include <smmintrin.h> 35 #include "main.h" 36 #include "celt/x86/x86cpu.h" 37 38 #include "stack_alloc.h" 39 40 typedef struct { 41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; 42 opus_int32 RandState[ DECISION_DELAY ]; 43 opus_int32 Q_Q10[ DECISION_DELAY ]; 44 opus_int32 Xq_Q14[ DECISION_DELAY ]; 45 opus_int32 Pred_Q15[ DECISION_DELAY ]; 46 opus_int32 Shape_Q14[ DECISION_DELAY ]; 47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; 48 opus_int32 LF_AR_Q14; 49 opus_int32 Diff_Q14; 50 opus_int32 Seed; 51 opus_int32 SeedInit; 52 opus_int32 RD_Q10; 53 } NSQ_del_dec_struct; 54 55 typedef struct { 56 opus_int32 Q_Q10; 57 opus_int32 RD_Q10; 58 opus_int32 xq_Q14; 59 opus_int32 LF_AR_Q14; 60 opus_int32 Diff_Q14; 61 opus_int32 sLTP_shp_Q14; 62 opus_int32 LPC_exc_Q14; 63 } NSQ_sample_struct; 64 65 typedef NSQ_sample_struct NSQ_sample_pair[ 2 ]; 66 67 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( 68 const silk_encoder_state *psEncC, /* I Encoder State */ 69 silk_nsq_state *NSQ, /* I/O NSQ state */ 70 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 71 const opus_int16 x16[], /* I Input */ 72 opus_int32 x_sc_Q10[], /* O Input scaled with 1/Gain in Q10 */ 73 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ 74 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 75 opus_int subfr, /* I Subframe number */ 76 opus_int nStatesDelayedDecision, /* I Number of del dec states */ 77 const opus_int LTP_scale_Q14, /* I LTP state scaling */ 78 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I */ 79 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag */ 80 const opus_int signal_type, /* I Signal type */ 81 const opus_int decisionDelay /* I Decision delay */ 82 ); 83 84 /******************************************/ 85 /* Noise shape quantizer for one subframe */ 86 
/******************************************/ 87 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( 88 silk_nsq_state *NSQ, /* I/O NSQ state */ 89 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 90 opus_int signalType, /* I Signal type */ 91 const opus_int32 x_Q10[], /* I */ 92 opus_int8 pulses[], /* O */ 93 opus_int16 xq[], /* O */ 94 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 95 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ 96 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 97 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 98 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 99 opus_int lag, /* I Pitch lag */ 100 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 101 opus_int Tilt_Q14, /* I Spectral tilt */ 102 opus_int32 LF_shp_Q14, /* I */ 103 opus_int32 Gain_Q16, /* I */ 104 opus_int Lambda_Q10, /* I */ 105 opus_int offset_Q10, /* I */ 106 opus_int length, /* I Input length */ 107 opus_int subfr, /* I Subframe number */ 108 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 109 opus_int predictLPCOrder, /* I Prediction filter order */ 110 opus_int warping_Q16, /* I */ 111 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ 112 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ 113 opus_int decisionDelay /* I */ 114 ); 115 116 void silk_NSQ_del_dec_sse4_1( 117 const silk_encoder_state *psEncC, /* I Encoder State */ 118 silk_nsq_state *NSQ, /* I/O NSQ state */ 119 SideInfoIndices *psIndices, /* I/O Quantization Indices */ 120 const opus_int16 x16[], /* I Input */ 121 opus_int8 pulses[], /* O Quantized pulse signal */ 122 const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ 123 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I Long term prediction coefs */ 124 const opus_int16 AR_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /* I Noise shaping coefs */ 125 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I 
Long term shaping coefs */ 126 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I Spectral tilt */ 127 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I Low frequency shaping coefs */ 128 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I Quantization step sizes */ 129 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lags */ 130 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ 131 const opus_int LTP_scale_Q14 /* I LTP state scaling */ 132 ) 133 { 134 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; 135 opus_int last_smple_idx, smpl_buf_idx, decisionDelay; 136 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; 137 opus_int16 *pxq; 138 VARDECL( opus_int32, sLTP_Q15 ); 139 VARDECL( opus_int16, sLTP ); 140 opus_int32 HarmShapeFIRPacked_Q14; 141 opus_int offset_Q10; 142 opus_int32 RDmin_Q10, Gain_Q10; 143 VARDECL( opus_int32, x_sc_Q10 ); 144 VARDECL( opus_int32, delayedGain_Q10 ); 145 VARDECL( NSQ_del_dec_struct, psDelDec ); 146 NSQ_del_dec_struct *psDD; 147 #ifdef OPUS_CHECK_ASM 148 silk_nsq_state NSQ_c; 149 SideInfoIndices psIndices_c; 150 opus_int8 pulses_c[ MAX_FRAME_LENGTH ]; 151 const opus_int8 *const pulses_a = pulses; 152 #endif 153 SAVE_STACK; 154 155 #ifdef OPUS_CHECK_ASM 156 ( void )pulses_a; 157 silk_memcpy( &NSQ_c, NSQ, sizeof( NSQ_c ) ); 158 silk_memcpy( &psIndices_c, psIndices, sizeof( psIndices_c ) ); 159 silk_assert( psEncC->nb_subfr * psEncC->subfr_length <= MAX_FRAME_LENGTH ); 160 silk_memcpy( pulses_c, pulses, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ); 161 silk_NSQ_del_dec_c( 162 psEncC, 163 &NSQ_c, 164 &psIndices_c, 165 x16, 166 pulses_c, 167 PredCoef_Q12, 168 LTPCoef_Q14, 169 AR_Q13, 170 HarmShapeGain_Q14, 171 Tilt_Q14, 172 LF_shp_Q14, 173 Gains_Q16, 174 pitchL, 175 Lambda_Q10, 176 LTP_scale_Q14 177 ); 178 #endif 179 180 /* Set unvoiced lag to the previous one, overwrite later for voiced */ 181 lag = NSQ->lagPrev; 182 183 silk_assert( NSQ->prev_gain_Q16 != 0 ); 184 185 /* Initialize delayed decision 
states */ 186 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct ); 187 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_dec_struct ) ); 188 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) { 189 psDD = &psDelDec[ k ]; 190 psDD->Seed = ( k + psIndices->Seed ) & 3; 191 psDD->SeedInit = psDD->Seed; 192 psDD->RD_Q10 = 0; 193 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; 194 psDD->Diff_Q14 = NSQ->sDiff_shp_Q14; 195 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; 196 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 197 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); 198 } 199 200 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ]; 201 smpl_buf_idx = 0; /* index of oldest samples */ 202 203 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length ); 204 205 /* For voiced frames limit the decision delay to lower than the pitch lag */ 206 if( psIndices->signalType == TYPE_VOICED ) { 207 for( k = 0; k < psEncC->nb_subfr; k++ ) { 208 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER / 2 - 1 ); 209 } 210 } else { 211 if( lag > 0 ) { 212 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1 ); 213 } 214 } 215 216 if( psIndices->NLSFInterpCoef_Q2 == 4 ) { 217 LSF_interpolation_flag = 0; 218 } else { 219 LSF_interpolation_flag = 1; 220 } 221 222 ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); 223 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); 224 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); 225 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); 226 /* Set up pointers to start of sub frame */ 227 pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; 228 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; 229 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 230 subfr = 0; 231 for( k = 0; k < psEncC->nb_subfr; k++ ) 
{ 232 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag ) ) * MAX_LPC_ORDER ]; 233 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; 234 AR_shp_Q13 = &AR_Q13[ k * MAX_SHAPE_LPC_ORDER ]; 235 236 /* Noise shape parameters */ 237 silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); 238 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); 239 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); 240 241 NSQ->rewhite_flag = 0; 242 if( psIndices->signalType == TYPE_VOICED ) { 243 /* Voiced */ 244 lag = pitchL[ k ]; 245 246 /* Re-whitening */ 247 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) { 248 if( k == 2 ) { 249 /* RESET DELAYED DECISIONS */ 250 /* Find winner */ 251 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; 252 Winner_ind = 0; 253 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) { 254 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) { 255 RDmin_Q10 = psDelDec[ i ].RD_Q10; 256 Winner_ind = i; 257 } 258 } 259 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) { 260 if( i != Winner_ind ) { 261 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 ); 262 silk_assert( psDelDec[ i ].RD_Q10 >= 0 ); 263 } 264 } 265 266 /* Copy final part of signals from winner state to output and long-term filter states */ 267 psDD = &psDelDec[ Winner_ind ]; 268 last_smple_idx = smpl_buf_idx + decisionDelay; 269 for( i = 0; i < decisionDelay; i++ ) { 270 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY; 271 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY; 272 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 273 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 274 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q16[ 1 ] ), 14 ) ); 275 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; 276 } 277 278 subfr = 0; 279 } 280 281 /* Rewhiten with new A coefs */ 282 start_idx = psEncC->ltp_mem_length - lag 
- psEncC->predictLPCOrder - LTP_ORDER / 2; 283 celt_assert( start_idx > 0 ); 284 285 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ], 286 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch ); 287 288 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 289 NSQ->rewhite_flag = 1; 290 } 291 } 292 293 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, 294 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay ); 295 296 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, 297 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], 298 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, 299 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayedDecision, &smpl_buf_idx, decisionDelay ); 300 301 x16 += psEncC->subfr_length; 302 pulses += psEncC->subfr_length; 303 pxq += psEncC->subfr_length; 304 } 305 306 /* Find winner */ 307 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; 308 Winner_ind = 0; 309 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { 310 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) { 311 RDmin_Q10 = psDelDec[ k ].RD_Q10; 312 Winner_ind = k; 313 } 314 } 315 316 /* Copy final part of signals from winner state to output and long-term filter states */ 317 psDD = &psDelDec[ Winner_ind ]; 318 psIndices->Seed = psDD->SeedInit; 319 last_smple_idx = smpl_buf_idx + decisionDelay; 320 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); 321 for( i = 0; i < decisionDelay; i++ ) { 322 last_smple_idx = ( last_smple_idx - 1 ) % DECISION_DELAY; 323 if( last_smple_idx < 0 ) last_smple_idx += DECISION_DELAY; 324 325 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 326 pxq[ i - decisionDelay ] = 
(opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 327 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); 328 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->Shape_Q14[ last_smple_idx ]; 329 } 330 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 331 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) ); 332 333 /* Update states */ 334 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; 335 NSQ->sDiff_shp_Q14 = psDD->Diff_Q14; 336 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; 337 338 /* Save quantized speech signal */ 339 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) ); 340 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) ); 341 342 #ifdef OPUS_CHECK_ASM 343 silk_assert( !memcmp( &NSQ_c, NSQ, sizeof( NSQ_c ) ) ); 344 silk_assert( !memcmp( &psIndices_c, psIndices, sizeof( psIndices_c ) ) ); 345 silk_assert( !memcmp( pulses_c, pulses_a, psEncC->nb_subfr * psEncC->subfr_length * sizeof( pulses[0] ) ) ); 346 #endif 347 348 RESTORE_STACK; 349 } 350 351 /******************************************/ 352 /* Noise shape quantizer for one subframe */ 353 /******************************************/ 354 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( 355 silk_nsq_state *NSQ, /* I/O NSQ state */ 356 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 357 opus_int signalType, /* I Signal type */ 358 const opus_int32 x_Q10[], /* I */ 359 opus_int8 pulses[], /* O */ 360 opus_int16 xq[], /* O */ 361 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 362 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer */ 363 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 364 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 365 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 366 opus_int lag, /* I 
Pitch lag */ 367 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 368 opus_int Tilt_Q14, /* I Spectral tilt */ 369 opus_int32 LF_shp_Q14, /* I */ 370 opus_int32 Gain_Q16, /* I */ 371 opus_int Lambda_Q10, /* I */ 372 opus_int offset_Q10, /* I */ 373 opus_int length, /* I Input length */ 374 opus_int subfr, /* I Subframe number */ 375 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 376 opus_int predictLPCOrder, /* I Prediction filter order */ 377 opus_int warping_Q16, /* I */ 378 opus_int nStatesDelayedDecision, /* I Number of states in decision tree */ 379 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ 380 opus_int decisionDelay /* I */ 381 ) 382 { 383 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; 384 opus_int32 Winner_rand_state; 385 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14; 386 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10; 387 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; 388 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; 389 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; 390 int rdo_offset; 391 392 VARDECL( NSQ_sample_pair, psSampleState ); 393 NSQ_del_dec_struct *psDD; 394 NSQ_sample_struct *psSS; 395 396 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF; 397 __m128i b_Q12_0123, b_sr_Q12_0123; 398 SAVE_STACK; 399 400 celt_assert( nStatesDelayedDecision > 0 ); 401 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); 402 403 rdo_offset = (Lambda_Q10 >> 1) - 512; 404 405 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ]; 406 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; 407 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); 408 409 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 ); 410 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 ); 411 412 if( opus_likely( predictLPCOrder == 16 ) ) { 413 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 ); 414 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 
12 ); 415 } 416 417 if( signalType == TYPE_VOICED ){ 418 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 ); 419 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 420 } 421 for( i = 0; i < length; i++ ) { 422 /* Perform common calculations used in all states */ 423 424 /* Long-term prediction */ 425 if( signalType == TYPE_VOICED ) { 426 /* Unrolled loop */ 427 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 428 LTP_pred_Q14 = 2; 429 { 430 __m128i tmpa, tmpb, pred_lag_ptr_tmp; 431 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(void*)(&pred_lag_ptr[ -3 ] ) ); 432 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B ); 433 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_0123 ); 434 tmpa = _mm_srli_si128( tmpa, 2 ); 435 436 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */ 437 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_0123 ); 438 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 ); 439 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa ); 440 441 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3, 2 ) );/* equal shift right 8 bytes */ 442 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb ); 443 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp ); 444 445 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_Q14[ 4 ] ); 446 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 ); /* Q13 -> Q14 */ 447 pred_lag_ptr++; 448 } 449 } else { 450 LTP_pred_Q14 = 0; 451 } 452 453 /* Long-term shaping */ 454 if( lag > 0 ) { 455 /* Symmetric, packed FIR coefficients */ 456 n_LTP_Q14 = silk_SMULWB( silk_ADD_SAT32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 ); 457 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 ); 458 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 ); /* Q12 -> Q14 */ 459 shp_lag_ptr++; 460 } else { 461 
n_LTP_Q14 = 0; 462 } 463 { 464 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp; 465 466 for( k = 0; k < nStatesDelayedDecision; k++ ) { 467 /* Delayed decision state */ 468 psDD = &psDelDec[ k ]; 469 470 /* Sample state */ 471 psSS = psSampleState[ k ]; 472 473 /* Generate dither */ 474 psDD->Seed = silk_RAND( psDD->Seed ); 475 476 /* Pointer used in short term prediction and shaping */ 477 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ]; 478 /* Short-term prediction */ 479 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 ); 480 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 481 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 ); 482 483 tmpb = _mm_setzero_si128(); 484 485 /* step 1 */ 486 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -3 ] ) ); /* -3, -2 , -1, 0 */ 487 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); /* 0, -1, -2, -3 */ 488 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 ); /* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */ 489 490 tmpa = _mm_srli_epi64( tmpa, 16 ); 491 tmpb = _mm_add_epi32( tmpb, tmpa ); 492 493 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 494 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 495 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /* 1*-1, 3*-3 */ 496 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 497 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 498 499 /* step 2 */ 500 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -7 ] ) ); 501 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 502 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 ); 503 tmpa = _mm_srli_epi64( tmpa, 16 ); 504 tmpb = _mm_add_epi32( tmpb, tmpa ); 505 506 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 507 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2, 1 ) ); 
/* equal shift right 4 bytes */ 508 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 509 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 510 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 511 512 if ( opus_likely( predictLPCOrder == 16 ) ) 513 { 514 /* step 3 */ 515 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -11 ] ) ); 516 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 517 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB ); 518 tmpa = _mm_srli_epi64( tmpa, 16 ); 519 tmpb = _mm_add_epi32( tmpb, tmpa ); 520 521 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 522 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3, 2, 1 ) );/* equal shift right 4 bytes */ 523 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 524 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 525 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 526 527 /* step 4 */ 528 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(void*)(&psLPC_Q14[ -15 ] ) ); 529 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); 530 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF ); 531 tmpa = _mm_srli_epi64( tmpa, 16 ); 532 tmpb = _mm_add_epi32( tmpb, tmpa ); 533 534 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 535 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ 536 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); 537 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); 538 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); 539 540 /* add at last */ 541 /* equal shift right 8 bytes*/ 542 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); 543 tmpb = _mm_add_epi32( tmpb, tmpa ); 544 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); 545 } 546 else 547 { 548 /* add at last */ 549 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0, 3, 2 ) ); /* equal shift right 8 bytes*/ 550 tmpb = 
_mm_add_epi32( tmpb, tmpa ); 551 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); 552 553 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a_Q12[ 8 ] ); 554 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a_Q12[ 9 ] ); 555 } 556 557 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ 558 559 /* Noise shape feedback */ 560 celt_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */ 561 /* Output of lowpass section */ 562 tmp2 = silk_SMLAWB( psDD->Diff_Q14, psDD->sAR2_Q14[ 0 ], warping_Q16 ); 563 /* Output of allpass section */ 564 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ 1 ], tmp2), warping_Q16 ); 565 psDD->sAR2_Q14[ 0 ] = tmp2; 566 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); 567 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); 568 /* Loop over allpass sections */ 569 for( j = 2; j < shapingLPCOrder; j += 2 ) { 570 /* Output of allpass section */ 571 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 0 ], tmp1), warping_Q16 ); 572 psDD->sAR2_Q14[ j - 1 ] = tmp1; 573 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ] ); 574 /* Output of allpass section */ 575 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], silk_SUB32_ovflw(psDD->sAR2_Q14[ j + 1 ], tmp2), warping_Q16 ); 576 psDD->sAR2_Q14[ j + 0 ] = tmp2; 577 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); 578 } 579 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1; 580 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOrder - 1 ] ); 581 582 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 ); /* Q11 -> Q12 */ 583 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 ); /* Q12 */ 584 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 ); /* Q12 -> Q14 */ 585 586 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp_Q14 ); /* Q12 */ 587 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 ); /* Q12 */ 588 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 ); /* Q12 -> Q14 */ 589 590 /* Input minus 
prediction plus noise feedback */ 591 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ 592 tmp1 = silk_ADD_SAT32( n_AR_Q14, n_LF_Q14 ); /* Q14 */ 593 tmp2 = silk_ADD32_ovflw( n_LTP_Q14, LPC_pred_Q14 ); /* Q13 */ 594 tmp1 = silk_SUB_SAT32( tmp2, tmp1 ); /* Q13 */ 595 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 ); /* Q10 */ 596 597 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 ); /* residual error Q10 */ 598 599 /* Flip sign depending on dither */ 600 if ( psDD->Seed < 0 ) { 601 r_Q10 = -r_Q10; 602 } 603 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); 604 605 /* Find two quantization level candidates and measure their rate-distortion */ 606 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); 607 q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); 608 if (Lambda_Q10 > 2048) { 609 /* For aggressive RDO, the bias becomes more than one pulse. */ 610 if (q1_Q10 > rdo_offset) { 611 q1_Q0 = silk_RSHIFT( q1_Q10 - rdo_offset, 10 ); 612 } else if (q1_Q10 < -rdo_offset) { 613 q1_Q0 = silk_RSHIFT( q1_Q10 + rdo_offset, 10 ); 614 } else if (q1_Q10 < 0) { 615 q1_Q0 = -1; 616 } else { 617 q1_Q0 = 0; 618 } 619 } 620 if( q1_Q0 > 0 ) { 621 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); 622 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); 623 q2_Q10 = silk_ADD32( q1_Q10, 1024 ); 624 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); 625 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 626 } else if( q1_Q0 == 0 ) { 627 q1_Q10 = offset_Q10; 628 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); 629 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); 630 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 631 } else if( q1_Q0 == -1 ) { 632 q2_Q10 = offset_Q10; 633 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10 ); 634 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); 635 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); 636 } else { /* q1_Q0 < -1 */ 637 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_ADJUST_Q10 ); 638 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); 639 q2_Q10 = silk_ADD32( 
q1_Q10, 1024 ); 640 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); 641 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); 642 } 643 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); 644 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 10 ); 645 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); 646 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 10 ); 647 648 if( rd1_Q10 < rd2_Q10 ) { 649 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); 650 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); 651 psSS[ 0 ].Q_Q10 = q1_Q10; 652 psSS[ 1 ].Q_Q10 = q2_Q10; 653 } else { 654 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); 655 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); 656 psSS[ 0 ].Q_Q10 = q2_Q10; 657 psSS[ 1 ].Q_Q10 = q1_Q10; 658 } 659 660 /* Update states for best quantization */ 661 662 /* Quantized excitation */ 663 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 ); 664 if ( psDD->Seed < 0 ) { 665 exc_Q14 = -exc_Q14; 666 } 667 668 /* Add predictions */ 669 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); 670 xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); 671 672 /* Update states */ 673 psSS[ 0 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); 674 sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 0 ].Diff_Q14, n_AR_Q14 ); 675 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); 676 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; 677 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; 678 psSS[ 0 ].xq_Q14 = xq_Q14; 679 680 /* Update states for second best quantization */ 681 682 /* Quantized excitation */ 683 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 ); 684 if ( psDD->Seed < 0 ) { 685 exc_Q14 = -exc_Q14; 686 } 687 688 /* Add predictions */ 689 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); 690 xq_Q14 = silk_ADD32_ovflw( LPC_exc_Q14, LPC_pred_Q14 ); 691 692 /* Update states */ 693 psSS[ 1 ].Diff_Q14 = silk_SUB32_ovflw( xq_Q14, silk_LSHIFT32( x_Q10[ i ], 4 ) ); 694 sLF_AR_shp_Q14 = silk_SUB32_ovflw( psSS[ 1 ].Diff_Q14, 
n_AR_Q14 ); 695 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB_SAT32( sLF_AR_shp_Q14, n_LF_Q14 ); 696 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; 697 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; 698 psSS[ 1 ].xq_Q14 = xq_Q14; 699 } 700 } 701 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) % DECISION_DELAY; 702 if( *smpl_buf_idx < 0 ) *smpl_buf_idx += DECISION_DELAY; 703 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) % DECISION_DELAY; 704 705 /* Find winner */ 706 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; 707 Winner_ind = 0; 708 for( k = 1; k < nStatesDelayedDecision; k++ ) { 709 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) { 710 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10; 711 Winner_ind = k; 712 } 713 } 714 715 /* Increase RD values of expired states */ 716 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ]; 717 for( k = 0; k < nStatesDelayedDecision; k++ ) { 718 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state ) { 719 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 0 ].RD_Q10, silk_int32_MAX >> 4 ); 720 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][ 1 ].RD_Q10, silk_int32_MAX >> 4 ); 721 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 ); 722 } 723 } 724 725 /* Find worst in first set and best in second set */ 726 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; 727 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10; 728 RDmax_ind = 0; 729 RDmin_ind = 0; 730 for( k = 1; k < nStatesDelayedDecision; k++ ) { 731 /* find worst in first set */ 732 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) { 733 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10; 734 RDmax_ind = k; 735 } 736 /* find best in second set */ 737 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) { 738 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10; 739 RDmin_ind = k; 740 } 741 } 742 743 /* Replace a state if best from second set outperforms worst in first set */ 744 if( RDmin_Q10 < RDmax_Q10 ) { 745 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i, 746 ( (opus_int32 
*)&psDelDec[ RDmin_ind ] ) + i, sizeof( NSQ_del_dec_struct ) - i * sizeof( opus_int32) ); 747 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin_ind ][ 1 ], sizeof( NSQ_sample_struct ) ); 748 } 749 750 /* Write samples from winner to output and long-term filter states */ 751 psDD = &psDelDec[ Winner_ind ]; 752 if( subfr > 0 || i >= decisionDelay ) { 753 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q10[ last_smple_idx ], 10 ); 754 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( 755 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ last_smple_idx ] ), 8 ) ); 756 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->Shape_Q14[ last_smple_idx ]; 757 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->Pred_Q15[ last_smple_idx ]; 758 } 759 NSQ->sLTP_shp_buf_idx++; 760 NSQ->sLTP_buf_idx++; 761 762 /* Update states */ 763 for( k = 0; k < nStatesDelayedDecision; k++ ) { 764 psDD = &psDelDec[ k ]; 765 psSS = &psSampleState[ k ][ 0 ]; 766 psDD->LF_AR_Q14 = psSS->LF_AR_Q14; 767 psDD->Diff_Q14 = psSS->Diff_Q14; 768 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; 769 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; 770 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; 771 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_exc_Q14, 1 ); 772 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14; 773 psDD->Seed = silk_ADD32_ovflw( psDD->Seed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) ); 774 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed; 775 psDD->RD_Q10 = psSS->RD_Q10; 776 } 777 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; 778 } 779 /* Update LPC states */ 780 for( k = 0; k < nStatesDelayedDecision; k++ ) { 781 psDD = &psDelDec[ k ]; 782 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) ); 783 } 784 RESTORE_STACK; 785 } 786

/********************************************************************************/
/* Scale the input and the quantizer states for one subframe.                   */
/*                                                                              */
/* 1. x16[] is multiplied by the inverse of Gains_Q16[ subfr ] and written to   */
/*    x_sc_Q10[] for psEncC->subfr_length samples.                              */
/* 2. If NSQ->rewhite_flag is set, the last ( lag + LTP_ORDER / 2 ) entries of  */
/*    sLTP[] before NSQ->sLTP_buf_idx are scaled into sLTP_Q15[].               */
/* 3. If the gain differs from NSQ->prev_gain_Q16, the long-term shaping state, */
/*    optionally the long-term prediction state, and every delayed-decision     */
/*    state are rescaled by gain_adj_Q16, and NSQ->prev_gain_Q16 is updated.    */
/********************************************************************************/
static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                       */
    silk_nsq_state      *NSQ,                       /* I/O  NSQ state                           */
    NSQ_del_dec_struct  psDelDec[],                 /* I/O  Delayed decision states             */
    const opus_int16    x16[],                      /* I    Input                               */
    opus_int32          x_sc_Q10[],                 /* O    Input scaled with 1/Gain in Q10     */
    const opus_int16    sLTP[],                     /* I    Re-whitened LTP state in Q0         */
    opus_int32          sLTP_Q15[],                 /* O    LTP state matching scaled input     */
    opus_int            subfr,                      /* I    Subframe number                     */
    opus_int            nStatesDelayedDecision,     /* I    Number of del dec states            */
    const opus_int      LTP_scale_Q14,              /* I    LTP state scaling                   */
    const opus_int32    Gains_Q16[ MAX_NB_SUBFR ],  /* I                                        */
    const opus_int      pitchL[ MAX_NB_SUBFR ],     /* I    Pitch lag                           */
    const opus_int      signal_type,                /* I    Signal type                         */
    const opus_int      decisionDelay               /* I    Decision delay                      */
)
{
    opus_int            n, state, pitch_lag;
    opus_int            subfr_len, ltp_start, ltp_end, shp_end;
    opus_int32          adj_Q16, invGain_Q31, invGain_Q26;
    NSQ_del_dec_struct  *psState;
    __m128i             xmm_factor, xmm_even, xmm_odd;

    pitch_lag   = pitchL[ subfr ];
    invGain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( invGain_Q31 != 0 );

    /* Neither index is modified in this function, so the loop bounds can be fixed up front */
    subfr_len = psEncC->subfr_length;
    ltp_end   = NSQ->sLTP_buf_idx;
    shp_end   = NSQ->sLTP_shp_buf_idx;
    ltp_start = ltp_end - pitch_lag - LTP_ORDER / 2;

    /* ---- Scale input ---- */
    invGain_Q26 = silk_RSHIFT_ROUND( invGain_Q31, 5 );

    /* Inverse gain replicated into all four 32-bit lanes */
    xmm_factor = _mm_set1_epi32( invGain_Q26 );

    /* Vector part: four samples per iteration */
    n = 0;
    while( n < subfr_len - 3 ) {
        xmm_even = OP_CVTEPI16_EPI32_M64( &(x16[ n ] ) );

        /* _mm_mul_epi32() only multiplies lanes 0 and 2, so rotate lanes 1 and 3 into those slots */
        xmm_odd  = _mm_shuffle_epi32( xmm_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_even = _mm_mul_epi32( xmm_even, xmm_factor );
        xmm_odd  = _mm_mul_epi32( xmm_odd,  xmm_factor );

        /* Bring bits 16..47 of each 64-bit product into the low dword (even lanes) */
        /* or the high dword (odd lanes)                                            */
        xmm_even = _mm_srli_epi64( xmm_even, 16 );
        xmm_odd  = _mm_slli_epi64( xmm_odd,  16 );

        /* 0xCC takes 16-bit words 2, 3, 6 and 7, i.e. 32-bit lanes 1 and 3, from the odd products */
        xmm_even = _mm_blend_epi16( xmm_even, xmm_odd, 0xCC );

        _mm_storeu_si128( (__m128i *)(void*)(&(x_sc_Q10[ n ] ) ), xmm_even );
        n += 4;
    }

    /* Scalar part: up to three leftover samples */
    while( n < subfr_len ) {
        x_sc_Q10[ n ] = silk_SMULWW( x16[ n ], invGain_Q26 );
        n++;
    }

    /* ---- After rewhitening the LTP state is un-scaled, so apply the inverse gain to it ---- */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            invGain_Q31 = silk_LSHIFT( silk_SMULWB( invGain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( n = ltp_start; n < ltp_end; n++ ) {
            silk_assert( n < MAX_FRAME_LENGTH );
            sLTP_Q15[ n ] = silk_SMULWB( invGain_Q31, sLTP[ n ] );
        }
    }

    /* ---- Adjust for changing gain ---- */
    if( Gains_Q16[ subfr ] == NSQ->prev_gain_Q16 ) {
        /* Gain unchanged: the stored states already match and prev_gain_Q16 needs no update */
        return;
    }

    adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );

    /* Gain adjustment replicated into all four 32-bit lanes */
    xmm_factor = _mm_set1_epi32( adj_Q16 );

    /* Scale long-term shaping state: vector part, same lane arithmetic as for the input */
    for( n = shp_end - psEncC->ltp_mem_length; n < shp_end - 3; n += 4 ) {
        xmm_even = _mm_loadu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ n ] ) ) );
        xmm_odd  = _mm_shuffle_epi32( xmm_even, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_even = _mm_mul_epi32( xmm_even, xmm_factor );
        xmm_odd  = _mm_mul_epi32( xmm_odd,  xmm_factor );

        xmm_even = _mm_srli_epi64( xmm_even, 16 );
        xmm_odd  = _mm_slli_epi64( xmm_odd,  16 );

        xmm_even = _mm_blend_epi16( xmm_even, xmm_odd, 0xCC );

        _mm_storeu_si128( (__m128i *)(void*)(&(NSQ->sLTP_shp_Q14[ n ] ) ), xmm_even );
    }

    /* Scale long-term shaping state: scalar remainder */
    for( ; n < shp_end; n++ ) {
        NSQ->sLTP_shp_Q14[ n ] = silk_SMULWW( adj_Q16, NSQ->sLTP_shp_Q14[ n ] );
    }

    /* Scale long-term prediction state (skipped when it was just rebuilt by rewhitening) */
    if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
        for( n = ltp_start; n < ltp_end - decisionDelay; n++ ) {
            sLTP_Q15[ n ] = silk_SMULWW( adj_Q16, sLTP_Q15[ n ] );
        }
    }

    /* Scale every delayed-decision state */
    for( state = 0; state < nStatesDelayedDecision; state++ ) {
        psState = &psDelDec[ state ];

        /* Scalar states */
        psState->LF_AR_Q14 = silk_SMULWW( adj_Q16, psState->LF_AR_Q14 );
        psState->Diff_Q14  = silk_SMULWW( adj_Q16, psState->Diff_Q14 );

        /* Short-term prediction state */
        for( n = 0; n < NSQ_LPC_BUF_LENGTH; n++ ) {
            psState->sLPC_Q14[ n ] = silk_SMULWW( adj_Q16, psState->sLPC_Q14[ n ] );
        }

        /* Short-term shaping state */
        for( n = 0; n < MAX_SHAPE_LPC_ORDER; n++ ) {
            psState->sAR2_Q14[ n ] = silk_SMULWW( adj_Q16, psState->sAR2_Q14[ n ] );
        }

        /* Delayed prediction and shaping samples */
        for( n = 0; n < DECISION_DELAY; n++ ) {
            psState->Pred_Q15[ n ]  = silk_SMULWW( adj_Q16, psState->Pred_Q15[ n ] );
            psState->Shape_Q14[ n ] = silk_SMULWW( adj_Q16, psState->Shape_Q14[ n ] );
        }
    }

    /* Remember the gain these states are now scaled for */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];
}