NSQ_del_dec_avx2.c (52645B)
1 /*********************************************************************** 2 Copyright (c) 2021 Google Inc. 3 Redistribution and use in source and binary forms, with or without 4 modification, are permitted provided that the following conditions 5 are met: 6 - Redistributions of source code must retain the above copyright notice, 7 this list of conditions and the following disclaimer. 8 - Redistributions in binary form must reproduce the above copyright 9 notice, this list of conditions and the following disclaimer in the 10 documentation and/or other materials provided with the distribution. 11 - Neither the name of Internet Society, IETF or IETF Trust, nor the 12 names of specific contributors, may be used to endorse or promote 13 products derived from this software without specific prior written 14 permission. 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 POSSIBILITY OF SUCH DAMAGE. 
26 ***********************************************************************/ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #ifdef OPUS_CHECK_ASM 33 #include <string.h> 34 #endif 35 36 #include "opus_defines.h" 37 #include <immintrin.h> 38 39 #include "main.h" 40 #include "stack_alloc.h" 41 #include "NSQ.h" 42 #include "celt/x86/x86cpu.h" 43 44 /* Returns TRUE if all assumptions met */ 45 static OPUS_INLINE int verify_assumptions(const silk_encoder_state *psEncC) 46 { 47 /* This optimization is based on these assumptions */ 48 /* These assumptions are fundamental and hence assert are */ 49 /* used. Should any assert triggers, we have to re-visit */ 50 /* all related code to make sure it still functions the */ 51 /* same as the C implementation. */ 52 silk_assert(MAX_DEL_DEC_STATES <= 4 && 53 MAX_FRAME_LENGTH % 4 == 0 && 54 MAX_SUB_FRAME_LENGTH % 4 == 0 && 55 LTP_MEM_LENGTH_MS % 4 == 0 ); 56 silk_assert(psEncC->fs_kHz == 8 || 57 psEncC->fs_kHz == 12 || 58 psEncC->fs_kHz == 16 ); 59 silk_assert(psEncC->nb_subfr <= MAX_NB_SUBFR && 60 psEncC->nb_subfr > 0 ); 61 silk_assert(psEncC->nStatesDelayedDecision <= MAX_DEL_DEC_STATES && 62 psEncC->nStatesDelayedDecision > 0 ); 63 silk_assert(psEncC->ltp_mem_length == psEncC->fs_kHz * LTP_MEM_LENGTH_MS); 64 65 /* Regressions were observed on certain AMD Zen CPUs when */ 66 /* nStatesDelayedDecision is 1 or 2. Ideally we should detect */ 67 /* these CPUs and enable this optimization on others; however, */ 68 /* there is no good way to do so under current OPUS framework. 
*/ 69 return psEncC->nStatesDelayedDecision == 3 || 70 psEncC->nStatesDelayedDecision == 4; 71 } 72 73 /* Intrinsics not defined on MSVC */ 74 #ifdef _MSC_VER 75 #include <intsafe.h> 76 static inline int __builtin_sadd_overflow(opus_int32 a, opus_int32 b, opus_int32* res) 77 { 78 *res = a+b; 79 return (*res ^ a) & (*res ^ b) & 0x80000000; 80 } 81 static inline int __builtin_ctz(unsigned int x) 82 { 83 DWORD res = 0; 84 return _BitScanForward(&res, x) ? res : 32; 85 } 86 #endif 87 88 static OPUS_INLINE __m128i silk_cvtepi64_epi32_high(__m256i num) 89 { 90 return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(num, _mm256_set_epi32(0, 0, 0, 0, 7, 5, 3, 1))); 91 } 92 93 static OPUS_INLINE opus_int16 silk_sat16(opus_int32 num) 94 { 95 num = num > silk_int16_MAX ? silk_int16_MAX : num; 96 num = num < silk_int16_MIN ? silk_int16_MIN : num; 97 return num; 98 } 99 100 static OPUS_INLINE opus_int32 silk_sar_round_32(opus_int32 a, int bits) 101 { 102 silk_assert(bits > 0 && bits < 31); 103 a += 1 << (bits-1); 104 return a >> bits; 105 } 106 107 static OPUS_INLINE opus_int64 silk_sar_round_smulww(opus_int32 a, opus_int32 b, int bits) 108 { 109 #ifndef OPUS_CHECK_ASM 110 opus_int64 t; 111 #endif 112 silk_assert(bits > 0 && bits < 63); 113 #ifdef OPUS_CHECK_ASM 114 return silk_RSHIFT_ROUND(silk_SMULWW(a, b), bits); 115 #else 116 /* This code is more correct, but it won't overflow like the C code in some rare cases. */ 117 silk_assert(bits > 0 && bits < 63); 118 t = ((opus_int64)a) * ((opus_int64)b); 119 bits += 16; 120 t += 1ull << (bits-1); 121 return t >> bits; 122 #endif 123 } 124 125 static OPUS_INLINE opus_int32 silk_add_sat32(opus_int32 a, opus_int32 b) 126 { 127 opus_int32 sum; 128 if (__builtin_sadd_overflow(a, b, &sum)) 129 { 130 return a >= 0 ? 
silk_int32_MAX : silk_int32_MIN; 131 } 132 return sum; 133 } 134 135 static OPUS_INLINE __m128i silk_mm_srai_round_epi32(__m128i a, int bits) 136 { 137 silk_assert(bits > 0 && bits < 31); 138 return _mm_srai_epi32(_mm_add_epi32(a, _mm_set1_epi32(1 << (bits - 1))), bits); 139 } 140 141 /* add/subtract with output saturated */ 142 static OPUS_INLINE __m128i silk_mm_add_sat_epi32(__m128i a, __m128i b) 143 { 144 __m128i r = _mm_add_epi32(a, b); 145 __m128i OF = _mm_and_si128(_mm_xor_si128(a, r), _mm_xor_si128(b, r)); /* OF = (sum ^ a) & (sum ^ b) */ 146 __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ 147 return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); 148 } 149 static OPUS_INLINE __m128i silk_mm_sub_sat_epi32(__m128i a, __m128i b) 150 { 151 __m128i r = _mm_sub_epi32(a, b); 152 __m128i OF = _mm_andnot_si128(_mm_xor_si128(b, r), _mm_xor_si128(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ 153 __m128i SAT = _mm_add_epi32(_mm_srli_epi32(a, 31), _mm_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ 154 return _mm_blendv_epi8(r, SAT, _mm_srai_epi32(OF, 31)); 155 } 156 static OPUS_INLINE __m256i silk_mm256_sub_sat_epi32(__m256i a, __m256i b) 157 { 158 __m256i r = _mm256_sub_epi32(a, b); 159 __m256i OF = _mm256_andnot_si256(_mm256_xor_si256(b, r), _mm256_xor_si256(a, r)); /* OF = (sum ^ a) & (sum ^ ~b) = (sum ^ a) & ~(sum ^ b) */ 160 __m256i SAT = _mm256_add_epi32(_mm256_srli_epi32(a, 31), _mm256_set1_epi32(0x7FFFFFFF)); /* SAT = (a >> 31) + 0x7FFFFFFF */ 161 return _mm256_blendv_epi8(r, SAT, _mm256_srai_epi32(OF, 31)); 162 } 163 164 static OPUS_INLINE __m128i silk_mm_limit_epi32(__m128i num, opus_int32 limit1, opus_int32 limit2) 165 { 166 opus_int32 lo = limit1 < limit2 ? limit1 : limit2; 167 opus_int32 hi = limit1 > limit2 ? 
limit1 : limit2; 168 169 num = _mm_min_epi32(num, _mm_set1_epi32(hi)); 170 num = _mm_max_epi32(num, _mm_set1_epi32(lo)); 171 return num; 172 } 173 174 /* cond < 0 ? -num : num */ 175 static OPUS_INLINE __m128i silk_mm_sign_epi32(__m128i num, __m128i cond) 176 { 177 return _mm_sign_epi32(num, _mm_or_si128(cond, _mm_set1_epi32(1))); 178 } 179 static OPUS_INLINE __m256i silk_mm256_sign_epi32(__m256i num, __m256i cond) 180 { 181 return _mm256_sign_epi32(num, _mm256_or_si256(cond, _mm256_set1_epi32(1))); 182 } 183 184 /* (a32 * b32) >> 16 */ 185 static OPUS_INLINE __m128i silk_mm_smulww_epi32(__m128i a, opus_int32 b) 186 { 187 return silk_cvtepi64_epi32_high(_mm256_slli_epi64(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32(b)), 16)); 188 } 189 190 /* (a32 * (opus_int32)((opus_int16)(b32))) >> 16 output have to be 32bit int */ 191 static OPUS_INLINE __m128i silk_mm_smulwb_epi32(__m128i a, opus_int32 b) 192 { 193 return silk_cvtepi64_epi32_high(_mm256_mul_epi32(_mm256_cvtepi32_epi64(a), _mm256_set1_epi32((opus_uint32)b<<16))); 194 } 195 196 /* (opus_int32)((opus_int16)(a3))) * (opus_int32)((opus_int16)(b32)) output have to be 32bit int */ 197 static OPUS_INLINE __m256i silk_mm256_smulbb_epi32(__m256i a, __m256i b) 198 { 199 const char FF = (char)0xFF; 200 __m256i msk = _mm256_set_epi8( 201 FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0, 202 FF, FF, FF, FF, FF, FF, FF, FF, 13, 12, 9, 8, 5, 4, 1, 0); 203 __m256i lo = _mm256_mullo_epi16(a, b); 204 __m256i hi = _mm256_mulhi_epi16(a, b); 205 lo = _mm256_shuffle_epi8(lo, msk); 206 hi = _mm256_shuffle_epi8(hi, msk); 207 return _mm256_unpacklo_epi16(lo, hi); 208 } 209 210 static OPUS_INLINE __m256i silk_mm256_reverse_epi32(__m256i v) 211 { 212 v = _mm256_shuffle_epi32(v, 0x1B); 213 v = _mm256_permute4x64_epi64(v, 0x4E); 214 return v; 215 } 216 217 static OPUS_INLINE opus_int32 silk_mm256_hsum_epi32(__m256i v) 218 { 219 __m128i sum = _mm_add_epi32(_mm256_extracti128_si256(v, 1), _mm256_extracti128_si256(v, 
0)); 220 sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); 221 sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); 222 return _mm_cvtsi128_si32(sum); 223 } 224 225 static OPUS_INLINE __m128i silk_mm_hmin_epi32(__m128i num) 226 { 227 num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2301 */ 228 num = _mm_min_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ 229 return num; 230 } 231 232 static OPUS_INLINE __m128i silk_mm_hmax_epi32(__m128i num) 233 { 234 num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0x4E)); /* 0123 -> 2310 */ 235 num = _mm_max_epi32(num, _mm_shuffle_epi32(num, 0xB1)); /* 0123 -> 1032 */ 236 return num; 237 } 238 239 static OPUS_INLINE __m128i silk_mm_mask_hmin_epi32(__m128i num, __m128i mask) 240 { 241 num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MAX), mask); 242 return silk_mm_hmin_epi32(num); 243 } 244 245 static OPUS_INLINE __m128i silk_mm_mask_hmax_epi32(__m128i num, __m128i mask) 246 { 247 num = _mm_blendv_epi8(num, _mm_set1_epi32(silk_int32_MIN), mask); 248 return silk_mm_hmax_epi32(num); 249 } 250 251 static OPUS_INLINE __m128i silk_mm256_rand_epi32(__m128i seed) 252 { 253 seed = _mm_mullo_epi32(seed, _mm_set1_epi32(RAND_MULTIPLIER)); 254 seed = _mm_add_epi32(seed, _mm_set1_epi32(RAND_INCREMENT)); 255 return seed; 256 } 257 258 static OPUS_INLINE opus_int32 silk_index_of_first_equal_epi32(__m128i a, __m128i b) 259 { 260 unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b)) & 0x1111; 261 silk_assert(mask != 0); 262 return __builtin_ctz(mask) >> 2; 263 } 264 265 static __m128i silk_index_to_selector(opus_int32 index) 266 { 267 silk_assert(index < 4); 268 index <<= 2; 269 return _mm_set_epi8( 270 index + 3, index + 2, index + 1, index + 0, 271 index + 3, index + 2, index + 1, index + 0, 272 index + 3, index + 2, index + 1, index + 0, 273 index + 3, index + 2, index + 1, index + 0); 274 } 275 276 static opus_int32 silk_select_winner(__m128i num, __m128i selector) 277 { 278 return 
_mm_cvtsi128_si32(_mm_shuffle_epi8(num, selector)); 279 } 280 281 typedef struct 282 { 283 __m128i RandState; 284 __m128i Q_Q10; 285 __m128i Xq_Q14; 286 __m128i Pred_Q15; 287 __m128i Shape_Q14; 288 } NSQ_del_dec_sample_struct; 289 290 typedef struct 291 { 292 __m128i sLPC_Q14[MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH]; 293 __m128i LF_AR_Q14; 294 __m128i Seed; 295 __m128i SeedInit; 296 __m128i RD_Q10; 297 __m128i Diff_Q14; 298 __m128i sAR2_Q14[MAX_SHAPE_LPC_ORDER]; 299 NSQ_del_dec_sample_struct Samples[DECISION_DELAY]; 300 } NSQ_del_dec_struct; 301 302 static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2( 303 const silk_encoder_state *psEncC, /* I Encoder State */ 304 silk_nsq_state *NSQ, /* I/O NSQ state */ 305 NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ 306 const opus_int16 x16[], /* I Input */ 307 opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O Input scaled with 1/Gain in Q10 */ 308 const opus_int16 sLTP[], /* I Re-whitened LTP state in Q0 */ 309 opus_int32 sLTP_Q15[], /* O LTP state matching scaled input */ 310 opus_int subfr, /* I Subframe number */ 311 const opus_int LTP_scale_Q14, /* I LTP state scaling */ 312 const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I */ 313 const opus_int pitchL[MAX_NB_SUBFR], /* I Pitch lag */ 314 const opus_int signal_type, /* I Signal type */ 315 const opus_int decisionDelay /* I Decision delay */ 316 ); 317 318 /*******************************************/ 319 /* LPC analysis filter */ 320 /* NB! 
State is kept internally and the */ 321 /* filter always starts with zero state */ 322 /* first d output samples are set to zero */ 323 /*******************************************/ 324 static OPUS_INLINE void silk_LPC_analysis_filter_avx2( 325 opus_int16 *out, /* O Output signal */ 326 const opus_int16 *in, /* I Input signal */ 327 const opus_int16 *B, /* I MA prediction coefficients, Q12 [order] */ 328 const opus_int32 len, /* I Signal length */ 329 const opus_int32 order /* I Filter order */ 330 ); 331 332 /******************************************/ 333 /* Noise shape quantizer for one subframe */ 334 /******************************************/ 335 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( 336 silk_nsq_state *NSQ, /* I/O NSQ state */ 337 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states */ 338 opus_int signalType, /* I Signal type */ 339 const opus_int32 x_Q10[], /* I */ 340 opus_int8 pulses[], /* O */ 341 opus_int16 xq[], /* O */ 342 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 343 opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ 344 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 345 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 346 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 347 opus_int lag, /* I Pitch lag */ 348 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 349 opus_int Tilt_Q14, /* I Spectral tilt */ 350 opus_int32 LF_shp_Q14, /* I */ 351 opus_int32 Gain_Q16, /* I */ 352 opus_int Lambda_Q10, /* I */ 353 opus_int offset_Q10, /* I */ 354 opus_int length, /* I Input length */ 355 opus_int subfr, /* I Subframe number */ 356 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 357 opus_int predictLPCOrder, /* I Prediction filter order */ 358 opus_int warping_Q16, /* I */ 359 __m128i MaskDelDec, /* I Mask of states in decision tree */ 360 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ 361 opus_int decisionDelay /* I */ 362 ); 363 
364 void silk_NSQ_del_dec_avx2( 365 const silk_encoder_state *psEncC, /* I Encoder State */ 366 silk_nsq_state *NSQ, /* I/O NSQ state */ 367 SideInfoIndices *psIndices, /* I/O Quantization Indices */ 368 const opus_int16 x16[], /* I Input */ 369 opus_int8 pulses[], /* O Quantized pulse signal */ 370 const opus_int16 *PredCoef_Q12, /* I Short term prediction coefs */ 371 const opus_int16 LTPCoef_Q14[LTP_ORDER * MAX_NB_SUBFR], /* I Long term prediction coefs */ 372 const opus_int16 AR_Q13[MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER], /* I Noise shaping coefs */ 373 const opus_int HarmShapeGain_Q14[MAX_NB_SUBFR], /* I Long term shaping coefs */ 374 const opus_int Tilt_Q14[MAX_NB_SUBFR], /* I Spectral tilt */ 375 const opus_int32 LF_shp_Q14[MAX_NB_SUBFR], /* I Low frequency shaping coefs */ 376 const opus_int32 Gains_Q16[MAX_NB_SUBFR], /* I Quantization step sizes */ 377 const opus_int32 pitchL[MAX_NB_SUBFR], /* I Pitch lags */ 378 const opus_int Lambda_Q10, /* I Rate/distortion tradeoff */ 379 const opus_int LTP_scale_Q14 /* I LTP state scaling */ 380 ) 381 { 382 #ifdef OPUS_CHECK_ASM 383 silk_nsq_state NSQ_c; 384 SideInfoIndices psIndices_c; 385 opus_int8 pulses_c[MAX_FRAME_LENGTH]; 386 const opus_int8 *const pulses_a = pulses; 387 388 silk_memcpy(&NSQ_c, NSQ, sizeof(NSQ_c)); 389 silk_memcpy(&psIndices_c, psIndices, sizeof(psIndices_c)); 390 silk_memcpy(pulses_c, pulses, sizeof(pulses_c)); 391 silk_NSQ_del_dec_c(psEncC, &NSQ_c, &psIndices_c, x16, pulses_c, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, 392 pitchL, Lambda_Q10, LTP_scale_Q14); 393 #endif 394 395 if (!verify_assumptions(psEncC)) 396 { 397 silk_NSQ_del_dec_c(psEncC, NSQ, psIndices, x16, pulses, PredCoef_Q12, LTPCoef_Q14, AR_Q13, HarmShapeGain_Q14, Tilt_Q14, LF_shp_Q14, Gains_Q16, pitchL, Lambda_Q10, LTP_scale_Q14); 398 return; 399 } 400 401 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind, subfr; 402 opus_int last_smple_idx, smpl_buf_idx, decisionDelay; 403 
const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; 404 opus_int16 *pxq; 405 VARDECL(opus_int32, sLTP_Q15); 406 VARDECL(opus_int16, sLTP); 407 opus_int32 HarmShapeFIRPacked_Q14; 408 opus_int offset_Q10; 409 opus_int32 Gain_Q10; 410 opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH]; 411 opus_int32 delayedGain_Q10[DECISION_DELAY]; 412 NSQ_del_dec_struct psDelDec = {0}; 413 NSQ_del_dec_sample_struct *psSample; 414 __m128i RDmin_Q10, MaskDelDec, Winner_selector; 415 SAVE_STACK; 416 417 MaskDelDec = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFFFFF00ul << ((psEncC->nStatesDelayedDecision - 1) << 3))); 418 419 /* Set unvoiced lag to the previous one, overwrite later for voiced */ 420 lag = NSQ->lagPrev; 421 422 silk_assert(NSQ->prev_gain_Q16 != 0); 423 psDelDec.Seed = _mm_and_si128( 424 _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), _mm_set1_epi32(psIndices->Seed)), 425 _mm_set1_epi32(3)); 426 psDelDec.SeedInit = psDelDec.Seed; 427 psDelDec.RD_Q10 = _mm_setzero_si128(); 428 psDelDec.LF_AR_Q14 = _mm_set1_epi32(NSQ->sLF_AR_shp_Q14); 429 psDelDec.Diff_Q14 = _mm_set1_epi32(NSQ->sDiff_shp_Q14); 430 psDelDec.Samples[0].Shape_Q14 = _mm_set1_epi32(NSQ->sLTP_shp_Q14[psEncC->ltp_mem_length - 1]); 431 for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) 432 { 433 psDelDec.sLPC_Q14[i] = _mm_set1_epi32(NSQ->sLPC_Q14[i]); 434 } 435 for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) 436 { 437 psDelDec.sAR2_Q14[i] = _mm_set1_epi32(NSQ->sAR2_Q14[i]); 438 } 439 440 offset_Q10 = silk_Quantization_Offsets_Q10[psIndices->signalType >> 1][psIndices->quantOffsetType]; 441 smpl_buf_idx = 0; /* index of oldest samples */ 442 443 decisionDelay = silk_min_int(DECISION_DELAY, psEncC->subfr_length); 444 445 /* For voiced frames limit the decision delay to lower than the pitch lag */ 446 if (psIndices->signalType == TYPE_VOICED) 447 { 448 for (k = 0; k < psEncC->nb_subfr; k++) 449 { 450 decisionDelay = silk_min_int(decisionDelay, pitchL[k] - LTP_ORDER / 2 - 1); 451 } 452 } 453 else 454 { 455 if (lag > 0) 456 { 457 decisionDelay = 
silk_min_int(decisionDelay, lag - LTP_ORDER / 2 - 1); 458 } 459 } 460 461 if (psIndices->NLSFInterpCoef_Q2 == 4) 462 { 463 LSF_interpolation_flag = 0; 464 } 465 else 466 { 467 LSF_interpolation_flag = 1; 468 } 469 470 ALLOC(sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32); 471 ALLOC(sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16); 472 /* Set up pointers to start of sub frame */ 473 pxq = &NSQ->xq[psEncC->ltp_mem_length]; 474 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; 475 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 476 subfr = 0; 477 for (k = 0; k < psEncC->nb_subfr; k++) 478 { 479 A_Q12 = &PredCoef_Q12[((k >> 1) | (1 ^ LSF_interpolation_flag)) * MAX_LPC_ORDER]; 480 B_Q14 = <PCoef_Q14[k * LTP_ORDER]; 481 AR_shp_Q13 = &AR_Q13[k * MAX_SHAPE_LPC_ORDER]; 482 483 /* Noise shape parameters */ 484 silk_assert(HarmShapeGain_Q14[k] >= 0); 485 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 ); 486 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 ); 487 488 NSQ->rewhite_flag = 0; 489 if (psIndices->signalType == TYPE_VOICED) 490 { 491 /* Voiced */ 492 lag = pitchL[k]; 493 494 /* Re-whitening */ 495 if ((k & (3 ^ (LSF_interpolation_flag << 1))) == 0) 496 { 497 if (k == 2) 498 { 499 /* RESET DELAYED DECISIONS */ 500 /* Find winner */ 501 RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); 502 Winner_ind = silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10); 503 Winner_selector = silk_index_to_selector(Winner_ind); 504 psDelDec.RD_Q10 = _mm_add_epi32( 505 psDelDec.RD_Q10, 506 _mm_blendv_epi8( 507 _mm_set1_epi32(silk_int32_MAX >> 4), 508 _mm_setzero_si128(), 509 _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(Winner_ind << 3))))); 510 511 /* Copy final part of signals from winner state to output and long-term filter states */ 512 last_smple_idx = smpl_buf_idx + decisionDelay; 513 for (i = 0; i < decisionDelay; i++) 514 { 515 last_smple_idx = 
(last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; 516 psSample = &psDelDec.Samples[last_smple_idx]; 517 pulses[i - decisionDelay] = 518 (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); 519 pxq[i - decisionDelay] = 520 silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gains_Q16[1], 14)); 521 NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = 522 silk_select_winner(psSample->Shape_Q14, Winner_selector); 523 } 524 525 subfr = 0; 526 } 527 528 /* Rewhiten with new A coefs */ 529 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2; 530 silk_assert(start_idx > 0); 531 532 silk_LPC_analysis_filter_avx2(&sLTP[start_idx], &NSQ->xq[start_idx + k * psEncC->subfr_length], 533 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder); 534 535 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; 536 NSQ->rewhite_flag = 1; 537 } 538 } 539 540 silk_nsq_del_dec_scale_states_avx2(psEncC, NSQ, &psDelDec, x16, x_sc_Q10, sLTP, sLTP_Q15, k, 541 LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType, decisionDelay); 542 543 silk_noise_shape_quantizer_del_dec_avx2(NSQ, &psDelDec, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, 544 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[k], LF_shp_Q14[k], 545 Gains_Q16[k], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr++, psEncC->shapingLPCOrder, 546 psEncC->predictLPCOrder, psEncC->warping_Q16, MaskDelDec, &smpl_buf_idx, decisionDelay); 547 548 x16 += psEncC->subfr_length; 549 pulses += psEncC->subfr_length; 550 pxq += psEncC->subfr_length; 551 } 552 553 /* Find winner */ 554 RDmin_Q10 = silk_mm_mask_hmin_epi32(psDelDec.RD_Q10, MaskDelDec); 555 Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, psDelDec.RD_Q10)); 556 557 /* Copy final part of signals from winner state to output and long-term filter states */ 558 psIndices->Seed = 
silk_select_winner(psDelDec.SeedInit, Winner_selector); 559 last_smple_idx = smpl_buf_idx + decisionDelay; 560 Gain_Q10 = Gains_Q16[psEncC->nb_subfr - 1] >> 6; 561 for (i = 0; i < decisionDelay; i++) 562 { 563 last_smple_idx = (last_smple_idx + DECISION_DELAY - 1) % DECISION_DELAY; 564 psSample = &psDelDec.Samples[last_smple_idx]; 565 566 pulses[i - decisionDelay] = 567 (opus_int8)silk_sar_round_32(silk_select_winner(psSample->Q_Q10, Winner_selector), 10); 568 pxq[i - decisionDelay] = 569 silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psSample->Xq_Q14, Winner_selector), Gain_Q10, 8)); 570 NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay + i] = 571 silk_select_winner(psSample->Shape_Q14, Winner_selector); 572 } 573 for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++) 574 { 575 NSQ->sLPC_Q14[i] = silk_select_winner(psDelDec.sLPC_Q14[i], Winner_selector); 576 } 577 for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++) 578 { 579 NSQ->sAR2_Q14[i] = silk_select_winner(psDelDec.sAR2_Q14[i], Winner_selector); 580 } 581 582 /* Update states */ 583 NSQ->sLF_AR_shp_Q14 = silk_select_winner(psDelDec.LF_AR_Q14, Winner_selector); 584 NSQ->sDiff_shp_Q14 = silk_select_winner(psDelDec.Diff_Q14, Winner_selector); 585 NSQ->lagPrev = pitchL[psEncC->nb_subfr - 1]; 586 587 /* Save quantized speech signal */ 588 silk_memmove(NSQ->xq, &NSQ->xq[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int16)); 589 silk_memmove(NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[psEncC->frame_length], psEncC->ltp_mem_length * sizeof(opus_int32)); 590 591 #ifdef OPUS_CHECK_ASM 592 silk_assert(!memcmp(&NSQ_c, NSQ, sizeof(NSQ_c))); 593 silk_assert(!memcmp(&psIndices_c, psIndices, sizeof(psIndices_c))); 594 silk_assert(!memcmp(pulses_c, pulses_a, sizeof(pulses_c))); 595 #endif 596 597 RESTORE_STACK; 598 } 599 600 static OPUS_INLINE __m128i silk_noise_shape_quantizer_short_prediction_x4(const __m128i *buf32, const opus_int16 *coef16, opus_int order) 601 { 602 __m256i out; 603 silk_assert(order == 10 || 
order == 16); 604 605 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 606 out = _mm256_set1_epi32(order >> 1); 607 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-0]), _mm256_set1_epi32(silk_LSHIFT(coef16[0], 16)))); /* High DWORD */ 608 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-1]), _mm256_set1_epi32(silk_LSHIFT(coef16[1], 16)))); /* High DWORD */ 609 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-2]), _mm256_set1_epi32(silk_LSHIFT(coef16[2], 16)))); /* High DWORD */ 610 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-3]), _mm256_set1_epi32(silk_LSHIFT(coef16[3], 16)))); /* High DWORD */ 611 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-4]), _mm256_set1_epi32(silk_LSHIFT(coef16[4], 16)))); /* High DWORD */ 612 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-5]), _mm256_set1_epi32(silk_LSHIFT(coef16[5], 16)))); /* High DWORD */ 613 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-6]), _mm256_set1_epi32(silk_LSHIFT(coef16[6], 16)))); /* High DWORD */ 614 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-7]), _mm256_set1_epi32(silk_LSHIFT(coef16[7], 16)))); /* High DWORD */ 615 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-8]), _mm256_set1_epi32(silk_LSHIFT(coef16[8], 16)))); /* High DWORD */ 616 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-9]), _mm256_set1_epi32(silk_LSHIFT(coef16[9], 16)))); /* High DWORD */ 617 618 if (order == 16) 619 { 620 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-10]), _mm256_set1_epi32(silk_LSHIFT(coef16[10], 16)))); /* High DWORD */ 621 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-11]), _mm256_set1_epi32(silk_LSHIFT(coef16[11], 16)))); /* High DWORD */ 622 out = _mm256_add_epi32(out, 
_mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-12]), _mm256_set1_epi32(silk_LSHIFT(coef16[12], 16)))); /* High DWORD */ 623 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-13]), _mm256_set1_epi32(silk_LSHIFT(coef16[13], 16)))); /* High DWORD */ 624 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-14]), _mm256_set1_epi32(silk_LSHIFT(coef16[14], 16)))); /* High DWORD */ 625 out = _mm256_add_epi32(out, _mm256_mul_epi32(_mm256_cvtepi32_epi64(buf32[-15]), _mm256_set1_epi32(silk_LSHIFT(coef16[15], 16)))); /* High DWORD */ 626 } 627 return silk_cvtepi64_epi32_high(out); 628 } 629 630 /******************************************/ 631 /* Noise shape quantizer for one subframe */ 632 /******************************************/ 633 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_avx2( 634 silk_nsq_state *NSQ, /* I/O NSQ state */ 635 NSQ_del_dec_struct *psDelDec, /* I/O Delayed decision states */ 636 opus_int signalType, /* I Signal type */ 637 const opus_int32 x_Q10[], /* I */ 638 opus_int8 pulses[], /* O */ 639 opus_int16 xq[], /* O */ 640 opus_int32 sLTP_Q15[], /* I/O LTP filter state */ 641 opus_int32 delayedGain_Q10[DECISION_DELAY], /* I/O Gain delay buffer */ 642 const opus_int16 a_Q12[], /* I Short term prediction coefs */ 643 const opus_int16 b_Q14[], /* I Long term prediction coefs */ 644 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs */ 645 opus_int lag, /* I Pitch lag */ 646 opus_int32 HarmShapeFIRPacked_Q14, /* I */ 647 opus_int Tilt_Q14, /* I Spectral tilt */ 648 opus_int32 LF_shp_Q14, /* I */ 649 opus_int32 Gain_Q16, /* I */ 650 opus_int Lambda_Q10, /* I */ 651 opus_int offset_Q10, /* I */ 652 opus_int length, /* I Input length */ 653 opus_int subfr, /* I Subframe number */ 654 opus_int shapingLPCOrder, /* I Shaping LPC filter order */ 655 opus_int predictLPCOrder, /* I Prediction filter order */ 656 opus_int warping_Q16, /* I */ 657 __m128i MaskDelDec, /* I Mask of states in decision tree */ 
658 opus_int *smpl_buf_idx, /* I/O Index to newest samples in buffers */ 659 opus_int decisionDelay /* I */ 660 ) 661 { 662 int i; 663 opus_int32 *shp_lag_ptr = &NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2]; 664 opus_int32 *pred_lag_ptr = &sLTP_Q15[NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2]; 665 opus_int32 Gain_Q10 = Gain_Q16 >> 6; 666 667 for (i = 0; i < length; i++) 668 { 669 /* Perform common calculations used in all states */ 670 /* NSQ_sample_struct */ 671 /* Low 128 bits => 1st set */ 672 /* High 128 bits => 2nd set */ 673 int j; 674 __m256i SS_Q_Q10; 675 __m256i SS_RD_Q10; 676 __m256i SS_xq_Q14; 677 __m256i SS_LF_AR_Q14; 678 __m256i SS_Diff_Q14; 679 __m256i SS_sLTP_shp_Q14; 680 __m256i SS_LPC_exc_Q14; 681 __m256i exc_Q14; 682 __m256i q_Q10, rr_Q10, rd_Q10; 683 __m256i mask; 684 __m128i LPC_pred_Q14, n_AR_Q14; 685 __m128i RDmin_Q10, RDmax_Q10; 686 __m128i n_LF_Q14; 687 __m128i r_Q10, q1_Q0, q1_Q10, q2_Q10; 688 __m128i Winner_rand_state, Winner_selector; 689 __m128i tmp0, tmp1; 690 NSQ_del_dec_sample_struct *psLastSample, *psSample; 691 opus_int32 RDmin_ind, RDmax_ind, last_smple_idx; 692 opus_int32 LTP_pred_Q14, n_LTP_Q14; 693 694 /* Long-term prediction */ 695 if (signalType == TYPE_VOICED) 696 { 697 /* Unrolled loop */ 698 /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */ 699 LTP_pred_Q14 = 2; 700 LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-0], b_Q14[0]); 701 LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-1], b_Q14[1]); 702 LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-2], b_Q14[2]); 703 LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-3], b_Q14[3]); 704 LTP_pred_Q14 += silk_SMULWB(pred_lag_ptr[-4], b_Q14[4]); 705 LTP_pred_Q14 = silk_LSHIFT(LTP_pred_Q14, 1); /* Q13 -> Q14 */ 706 pred_lag_ptr++; 707 } 708 else 709 { 710 LTP_pred_Q14 = 0; 711 } 712 713 /* Long-term shaping */ 714 if (lag > 0) 715 { 716 /* Symmetric, packed FIR coefficients */ 717 n_LTP_Q14 = silk_add_sat32(shp_lag_ptr[0], shp_lag_ptr[-2]); 718 n_LTP_Q14 = 
silk_SMULWB(n_LTP_Q14, HarmShapeFIRPacked_Q14); 719 n_LTP_Q14 = n_LTP_Q14 + silk_SMULWT(shp_lag_ptr[-1], HarmShapeFIRPacked_Q14); 720 n_LTP_Q14 = LTP_pred_Q14 - (silk_LSHIFT(n_LTP_Q14, 2)); /* Q12 -> Q14 */ 721 shp_lag_ptr++; 722 } 723 else 724 { 725 n_LTP_Q14 = 0; 726 } 727 728 /* BEGIN Updating Delayed Decision States */ 729 730 /* Generate dither */ 731 psDelDec->Seed = silk_mm256_rand_epi32(psDelDec->Seed); 732 733 /* Short-term prediction */ 734 LPC_pred_Q14 = silk_noise_shape_quantizer_short_prediction_x4(&psDelDec->sLPC_Q14[NSQ_LPC_BUF_LENGTH - 1 + i], a_Q12, predictLPCOrder); 735 LPC_pred_Q14 = _mm_slli_epi32(LPC_pred_Q14, 4); /* Q10 -> Q14 */ 736 737 /* Noise shape feedback */ 738 silk_assert(shapingLPCOrder > 0); 739 silk_assert((shapingLPCOrder & 1) == 0); /* check that order is even */ 740 /* Output of lowpass section */ 741 tmp0 = _mm_add_epi32(psDelDec->Diff_Q14, silk_mm_smulwb_epi32(psDelDec->sAR2_Q14[0], warping_Q16)); 742 n_AR_Q14 = _mm_set1_epi32(shapingLPCOrder >> 1); 743 for (j = 0; j < shapingLPCOrder - 1; j++) 744 { 745 /* Output of allpass section */ 746 tmp1 = psDelDec->sAR2_Q14[j]; 747 psDelDec->sAR2_Q14[j] = tmp0; 748 n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[j])); 749 tmp0 = _mm_add_epi32(tmp1, silk_mm_smulwb_epi32(_mm_sub_epi32(psDelDec->sAR2_Q14[j + 1], tmp0), warping_Q16)); 750 } 751 psDelDec->sAR2_Q14[shapingLPCOrder - 1] = tmp0; 752 n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(tmp0, AR_shp_Q13[shapingLPCOrder - 1])); 753 754 n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 1); /* Q11 -> Q12 */ 755 n_AR_Q14 = _mm_add_epi32(n_AR_Q14, silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, Tilt_Q14)); /* Q12 */ 756 n_AR_Q14 = _mm_slli_epi32(n_AR_Q14, 2); /* Q12 -> Q14 */ 757 758 tmp0 = silk_mm_smulwb_epi32(psDelDec->Samples[*smpl_buf_idx].Shape_Q14, LF_shp_Q14); /* Q12 */ 759 tmp1 = silk_mm_smulwb_epi32(psDelDec->LF_AR_Q14, LF_shp_Q14 >> 16); /* Q12 */ 760 n_LF_Q14 = _mm_add_epi32(tmp0, tmp1); /* Q12 */ 761 n_LF_Q14 = 
_mm_slli_epi32(n_LF_Q14, 2); /* Q12 -> Q14 */ 762 763 /* Input minus prediction plus noise feedback */ 764 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_LTP */ 765 tmp0 = silk_mm_add_sat_epi32(n_AR_Q14, n_LF_Q14); /* Q14 */ 766 tmp1 = _mm_add_epi32(_mm_set1_epi32(n_LTP_Q14), LPC_pred_Q14); /* Q13 */ 767 tmp0 = silk_mm_sub_sat_epi32(tmp1, tmp0); /* Q13 */ 768 tmp0 = silk_mm_srai_round_epi32(tmp0, 4); /* Q10 */ 769 770 r_Q10 = _mm_sub_epi32(_mm_set1_epi32(x_Q10[i]), tmp0); /* residual error Q10 */ 771 772 /* Flip sign depending on dither */ 773 r_Q10 = silk_mm_sign_epi32(r_Q10, psDelDec->Seed); 774 r_Q10 = silk_mm_limit_epi32(r_Q10, -(31 << 10), 30 << 10); 775 776 /* Find two quantization level candidates and measure their rate-distortion */ 777 q1_Q10 = _mm_sub_epi32(r_Q10, _mm_set1_epi32(offset_Q10)); 778 q1_Q0 = _mm_srai_epi32(q1_Q10, 10); 779 if (Lambda_Q10 > 2048) 780 { 781 /* For aggressive RDO, the bias becomes more than one pulse. */ 782 tmp0 = _mm_sub_epi32(_mm_abs_epi32(q1_Q10), _mm_set1_epi32(Lambda_Q10 / 2 - 512)); /* rdo_offset */ 783 q1_Q0 = _mm_srai_epi32(q1_Q10, 31); 784 tmp1 = _mm_cmpgt_epi32(tmp0, _mm_setzero_si128()); 785 tmp0 = _mm_srai_epi32(silk_mm_sign_epi32(tmp0, q1_Q10), 10); 786 q1_Q0 = _mm_blendv_epi8(q1_Q0, tmp0, tmp1); 787 } 788 789 tmp0 = _mm_sign_epi32(_mm_set1_epi32(QUANT_LEVEL_ADJUST_Q10), q1_Q0); 790 q1_Q10 = _mm_sub_epi32(_mm_slli_epi32(q1_Q0, 10), tmp0); 791 q1_Q10 = _mm_add_epi32(q1_Q10, _mm_set1_epi32(offset_Q10)); 792 793 /* check if q1_Q0 is 0 or -1 */ 794 tmp0 = _mm_add_epi32(_mm_srli_epi32(q1_Q0, 31), q1_Q0); 795 tmp1 = _mm_cmpeq_epi32(tmp0, _mm_setzero_si128()); 796 tmp0 = _mm_blendv_epi8(_mm_set1_epi32(1024), _mm_set1_epi32(1024 - QUANT_LEVEL_ADJUST_Q10), tmp1); 797 q2_Q10 = _mm_add_epi32(q1_Q10, tmp0); 798 q_Q10 = _mm256_set_m128i(q2_Q10, q1_Q10); 799 800 rr_Q10 = _mm256_sub_epi32(_mm256_broadcastsi128_si256(r_Q10), q_Q10); 801 rd_Q10 = _mm256_abs_epi32(q_Q10); 802 rr_Q10 = silk_mm256_smulbb_epi32(rr_Q10, 
rr_Q10); 803 rd_Q10 = silk_mm256_smulbb_epi32(rd_Q10, _mm256_set1_epi32(Lambda_Q10)); 804 rd_Q10 = _mm256_add_epi32(rd_Q10, rr_Q10); 805 rd_Q10 = _mm256_srai_epi32(rd_Q10, 10); 806 807 mask = _mm256_broadcastsi128_si256(_mm_cmplt_epi32(_mm256_extracti128_si256(rd_Q10, 0), _mm256_extracti128_si256(rd_Q10, 1))); 808 SS_RD_Q10 = _mm256_add_epi32( 809 _mm256_broadcastsi128_si256(psDelDec->RD_Q10), 810 _mm256_blendv_epi8( 811 _mm256_permute2x128_si256(rd_Q10, rd_Q10, 0x1), 812 rd_Q10, 813 mask)); 814 SS_Q_Q10 = _mm256_blendv_epi8( 815 _mm256_permute2x128_si256(q_Q10, q_Q10, 0x1), 816 q_Q10, 817 mask); 818 819 /* Update states for best and second best quantization */ 820 821 /* Quantized excitation */ 822 exc_Q14 = silk_mm256_sign_epi32(_mm256_slli_epi32(SS_Q_Q10, 4), _mm256_broadcastsi128_si256(psDelDec->Seed)); 823 824 /* Add predictions */ 825 exc_Q14 = _mm256_add_epi32(exc_Q14, _mm256_set1_epi32(LTP_pred_Q14)); 826 SS_LPC_exc_Q14 = _mm256_slli_epi32(exc_Q14, 1); 827 SS_xq_Q14 = _mm256_add_epi32(exc_Q14, _mm256_broadcastsi128_si256(LPC_pred_Q14)); 828 829 /* Update states */ 830 SS_Diff_Q14 = _mm256_sub_epi32(SS_xq_Q14, _mm256_set1_epi32(silk_LSHIFT(x_Q10[i], 4))); 831 SS_LF_AR_Q14 = _mm256_sub_epi32(SS_Diff_Q14, _mm256_broadcastsi128_si256(n_AR_Q14)); 832 SS_sLTP_shp_Q14 = silk_mm256_sub_sat_epi32(SS_LF_AR_Q14, _mm256_broadcastsi128_si256(n_LF_Q14)); 833 834 /* END Updating Delayed Decision States */ 835 836 *smpl_buf_idx = (*smpl_buf_idx + DECISION_DELAY - 1) % DECISION_DELAY; 837 last_smple_idx = (*smpl_buf_idx + decisionDelay) % DECISION_DELAY; 838 psLastSample = &psDelDec->Samples[last_smple_idx]; 839 840 /* Find winner */ 841 RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_castsi256_si128(SS_RD_Q10), MaskDelDec); 842 Winner_selector = silk_index_to_selector(silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_castsi256_si128(SS_RD_Q10))); 843 844 /* Increase RD values of expired states */ 845 Winner_rand_state = _mm_shuffle_epi8(psLastSample->RandState, 
Winner_selector); 846 847 SS_RD_Q10 = _mm256_blendv_epi8( 848 _mm256_add_epi32(SS_RD_Q10, _mm256_set1_epi32(silk_int32_MAX >> 4)), 849 SS_RD_Q10, 850 _mm256_broadcastsi128_si256(_mm_cmpeq_epi32(psLastSample->RandState, Winner_rand_state))); 851 852 /* find worst in first set */ 853 RDmax_Q10 = silk_mm_mask_hmax_epi32(_mm256_extracti128_si256(SS_RD_Q10, 0), MaskDelDec); 854 /* find best in second set */ 855 RDmin_Q10 = silk_mm_mask_hmin_epi32(_mm256_extracti128_si256(SS_RD_Q10, 1), MaskDelDec); 856 857 /* Replace a state if best from second set outperforms worst in first set */ 858 tmp0 = _mm_cmplt_epi32(RDmin_Q10, RDmax_Q10); 859 if (!_mm_test_all_zeros(tmp0, tmp0)) 860 { 861 int t; 862 RDmax_ind = silk_index_of_first_equal_epi32(RDmax_Q10, _mm256_extracti128_si256(SS_RD_Q10, 0)); 863 RDmin_ind = silk_index_of_first_equal_epi32(RDmin_Q10, _mm256_extracti128_si256(SS_RD_Q10, 1)); 864 tmp1 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(0xFFU << (unsigned)(RDmax_ind << 3))); 865 tmp0 = _mm_blendv_epi8( 866 _mm_set_epi8(0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0), 867 silk_index_to_selector(RDmin_ind), 868 tmp1); 869 for (t = i; t < MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH; t++) 870 { 871 psDelDec->sLPC_Q14[t] = _mm_shuffle_epi8(psDelDec->sLPC_Q14[t], tmp0); 872 } 873 psDelDec->Seed = _mm_shuffle_epi8(psDelDec->Seed, tmp0); 874 psDelDec->SeedInit = _mm_shuffle_epi8(psDelDec->SeedInit, tmp0); 875 for (t = 0; t < MAX_SHAPE_LPC_ORDER; t++) 876 { 877 psDelDec->sAR2_Q14[t] = _mm_shuffle_epi8(psDelDec->sAR2_Q14[t], tmp0); 878 } 879 for (t = 0; t < DECISION_DELAY; t++) 880 { 881 psDelDec->Samples[t].RandState = _mm_shuffle_epi8(psDelDec->Samples[t].RandState, tmp0); 882 psDelDec->Samples[t].Q_Q10 = _mm_shuffle_epi8(psDelDec->Samples[t].Q_Q10, tmp0); 883 psDelDec->Samples[t].Xq_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Xq_Q14, tmp0); 884 psDelDec->Samples[t].Pred_Q15 = _mm_shuffle_epi8(psDelDec->Samples[t].Pred_Q15, tmp0); 885 
psDelDec->Samples[t].Shape_Q14 = _mm_shuffle_epi8(psDelDec->Samples[t].Shape_Q14, tmp0); 886 } 887 mask = _mm256_castsi128_si256(_mm_blendv_epi8(_mm_set_epi32(0x3, 0x2, 0x1, 0x0), _mm_set1_epi32(RDmin_ind + 4), tmp1)); 888 SS_Q_Q10 = _mm256_permutevar8x32_epi32(SS_Q_Q10, mask); 889 SS_RD_Q10 = _mm256_permutevar8x32_epi32(SS_RD_Q10, mask); 890 SS_xq_Q14 = _mm256_permutevar8x32_epi32(SS_xq_Q14, mask); 891 SS_LF_AR_Q14 = _mm256_permutevar8x32_epi32(SS_LF_AR_Q14, mask); 892 SS_Diff_Q14 = _mm256_permutevar8x32_epi32(SS_Diff_Q14, mask); 893 SS_sLTP_shp_Q14 = _mm256_permutevar8x32_epi32(SS_sLTP_shp_Q14, mask); 894 SS_LPC_exc_Q14 = _mm256_permutevar8x32_epi32(SS_LPC_exc_Q14, mask); 895 } 896 897 /* Write samples from winner to output and long-term filter states */ 898 if (subfr > 0 || i >= decisionDelay) 899 { 900 pulses[i - decisionDelay] = 901 (opus_int8)silk_sar_round_32(silk_select_winner(psLastSample->Q_Q10, Winner_selector), 10); 902 xq[i - decisionDelay] = 903 silk_sat16((opus_int32)silk_sar_round_smulww(silk_select_winner(psLastSample->Xq_Q14, Winner_selector), delayedGain_Q10[last_smple_idx], 8)); 904 NSQ->sLTP_shp_Q14[NSQ->sLTP_shp_buf_idx - decisionDelay] = 905 silk_select_winner(psLastSample->Shape_Q14, Winner_selector); 906 sLTP_Q15[NSQ->sLTP_buf_idx - decisionDelay] = 907 silk_select_winner(psLastSample->Pred_Q15, Winner_selector); 908 } 909 NSQ->sLTP_shp_buf_idx++; 910 NSQ->sLTP_buf_idx++; 911 912 /* Update states */ 913 psSample = &psDelDec->Samples[*smpl_buf_idx]; 914 psDelDec->Seed = _mm_add_epi32(psDelDec->Seed, silk_mm_srai_round_epi32(_mm256_castsi256_si128(SS_Q_Q10), 10)); 915 psDelDec->LF_AR_Q14 = _mm256_castsi256_si128(SS_LF_AR_Q14); 916 psDelDec->Diff_Q14 = _mm256_castsi256_si128(SS_Diff_Q14); 917 psDelDec->sLPC_Q14[i + NSQ_LPC_BUF_LENGTH] = _mm256_castsi256_si128(SS_xq_Q14); 918 psDelDec->RD_Q10 = _mm256_castsi256_si128(SS_RD_Q10); 919 psSample->Xq_Q14 = _mm256_castsi256_si128(SS_xq_Q14); 920 psSample->Q_Q10 = _mm256_castsi256_si128(SS_Q_Q10); 921 
        /* Store this sample's per-state results into the delayed-decision */
        /* ring buffer; Seed is saved as RandState so the winner's history */
        /* can be identified later when the decision delay expires.        */
        psSample->Pred_Q15 = _mm256_castsi256_si128(SS_LPC_exc_Q14);
        psSample->Shape_Q14 = _mm256_castsi256_si128(SS_sLTP_shp_Q14);
        psSample->RandState = psDelDec->Seed;
        delayedGain_Q10[*smpl_buf_idx] = Gain_Q10;
    }
    /* Update LPC states: copy the last NSQ_LPC_BUF_LENGTH output samples */
    /* to the head of the buffer so the next call continues seamlessly.   */
    for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
    {
        psDelDec->sLPC_Q14[i] = (&psDelDec->sLPC_Q14[length])[i];
    }
}

/* Scale the subframe input by 1/gain into Q10 and, when the gain or the  */
/* rewhitening state changed, rescale the LTP, shaping and prediction     */
/* state buffers to match. Each 128-bit lane operation updates all        */
/* delayed-decision states at once (MAX_DEL_DEC_STATES <= 4).             */
static OPUS_INLINE void silk_nsq_del_dec_scale_states_avx2(
    const silk_encoder_state *psEncC,          /* I    Encoder State                   */
    silk_nsq_state *NSQ,                       /* I/O  NSQ state                       */
    NSQ_del_dec_struct *psDelDec,              /* I/O  Delayed decision states         */
    const opus_int16 x16[],                    /* I    Input                           */
    opus_int32 x_sc_Q10[MAX_SUB_FRAME_LENGTH], /* O    Input scaled with 1/Gain in Q10 */
    const opus_int16 sLTP[],                   /* I    Re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                     /* O    LTP state matching scaled input */
    opus_int subfr,                            /* I    Subframe number                 */
    const opus_int LTP_scale_Q14,              /* I    LTP state scaling               */
    const opus_int32 Gains_Q16[MAX_NB_SUBFR],  /* I                                    */
    const opus_int pitchL[MAX_NB_SUBFR],       /* I    Pitch lag                       */
    const opus_int signal_type,                /* I    Signal type                     */
    const opus_int decisionDelay               /* I    Decision delay                  */
)
{
    int i;
    opus_int lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q26;
    NSQ_del_dec_sample_struct *psSample;

    lag = pitchL[subfr];
    /* 1/gain in Q47; gain is clamped to >= 1 so the inverse is finite */
    inv_gain_Q31 = silk_INVERSE32_varQ(silk_max(Gains_Q16[subfr], 1), 47);
    silk_assert(inv_gain_Q31 != 0);

    /* Scale input */
    inv_gain_Q26 = silk_sar_round_32(inv_gain_Q31, 5);
    for (i = 0; i < psEncC->subfr_length; i+=4)
    {
        /* 4 samples per iteration: widen to 64-bit lanes, multiply by     */
        /* inv_gain_Q26, then (<<16 and take high 32 bits) == product>>16, */
        /* yielding the input scaled to Q10.                               */
        __m256i x = _mm256_cvtepi16_epi64(_mm_loadu_si64(&x16[i]));
        x = _mm256_slli_epi64(_mm256_mul_epi32(x, _mm256_set1_epi32(inv_gain_Q26)), 16);
        _mm_storeu_si128((__m128i*)(void*)&x_sc_Q10[i], silk_cvtepi64_epi32_high(x));
    }

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if (NSQ->rewhite_flag)
    {
        if (subfr == 0)
        {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT(silk_SMULWB(inv_gain_Q31, LTP_scale_Q14), 2);
        }
        for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++)
        {
            silk_assert(i < MAX_FRAME_LENGTH);
            sLTP_Q15[i] = silk_SMULWB(inv_gain_Q31, sLTP[i]);
        }
    }

    /* Adjust for changing gain */
    if (Gains_Q16[subfr] != NSQ->prev_gain_Q16)
    {
        /* Ratio previous/current gain in Q16; every stored state below is */
        /* multiplied by this so it matches the new scaling.               */
        gain_adj_Q16 = silk_DIV32_varQ(NSQ->prev_gain_Q16, Gains_Q16[subfr], 16);

        /* Scale long-term shaping state */
        for (i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx; i+=4)
        {
            opus_int32 *p = &NSQ->sLTP_shp_Q14[i];
            _mm_storeu_si128((__m128i*)(void*)p, silk_mm_smulww_epi32(_mm_loadu_si128((__m128i*)(void*)p), gain_adj_Q16));
        }

        /* Scale long-term prediction state */
        if (signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0)
        {
            for (i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx - decisionDelay; i++)
            {
                /* Scalar equivalent of silk_mm_smulww_epi32: 32x32 -> 64  */
                /* multiply, keep bits [47:16].                            */
                sLTP_Q15[i] = ((opus_int64)sLTP_Q15[i]) * ((opus_int64)gain_adj_Q16) >> 16;
            }
        }

        /* Scale scalar states */
        psDelDec->LF_AR_Q14 = silk_mm_smulww_epi32(psDelDec->LF_AR_Q14, gain_adj_Q16);
        psDelDec->Diff_Q14 = silk_mm_smulww_epi32(psDelDec->Diff_Q14, gain_adj_Q16);

        /* Scale short-term prediction and shaping states */
        for (i = 0; i < NSQ_LPC_BUF_LENGTH; i++)
        {
            psDelDec->sLPC_Q14[i] = silk_mm_smulww_epi32(psDelDec->sLPC_Q14[i], gain_adj_Q16);
        }
        for (i = 0; i < DECISION_DELAY; i++)
        {
            psSample = &psDelDec->Samples[i];
            psSample->Pred_Q15 = silk_mm_smulww_epi32(psSample->Pred_Q15, gain_adj_Q16);
            psSample->Shape_Q14 = silk_mm_smulww_epi32(psSample->Shape_Q14, gain_adj_Q16);
        }
        for (i = 0; i < MAX_SHAPE_LPC_ORDER; i++)
        {
            psDelDec->sAR2_Q14[i] = silk_mm_smulww_epi32(psDelDec->sAR2_Q14[i], gain_adj_Q16);
        }

        /* Save inverse gain */
        NSQ->prev_gain_Q16 = Gains_Q16[subfr];
    }
}

/* LPC analysis (prediction-error) filter:                                 */
/*   out[i] = sat16( round( (in[i] << 12                                   */
/*                           - sum_{j<order} B[j] * in[i-1-j]) >> 12 ) )   */
/* The first 'order' output samples are set to zero. Only orders 10 and    */
/* 16 are supported (asserted below).                                      */
static OPUS_INLINE void silk_LPC_analysis_filter_avx2(
    opus_int16 *out,        /* O    Output signal                               */
    const opus_int16 *in,   /* I    Input signal                                */
    const opus_int16 *B,    /* I    MA prediction coefficients, Q12 [order]     */
    const opus_int32 len,   /* I    Signal length                               */
    const opus_int32 order  /* I    Filter order                                */
)
{
    int i;
    opus_int32 out32_Q12, out32;
    silk_assert(order == 10 || order == 16);

    for(i = order; i < len; i++ )
    {
        const opus_int16 *in_ptr = &in[ i ];
        /* Allowing wrap around so that two wraps can cancel each other. The rare
           cases where the result wraps around can only be triggered by invalid streams*/

        /* Taps 0..7: in[i-8..i-1] times time-reversed B[0..7], i.e. this  */
        /* accumulates B[j]*in[i-1-j] for j = 0..7 in eight 32-bit lanes.  */
        __m256i in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-8]));
        __m256i B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)& B[0]));
        __m256i sum = _mm256_mullo_epi32(in_v, silk_mm256_reverse_epi32(B_v));
        if (order > 10)
        {
            /* Taps 8..15, same reversed-time pairing as above */
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&in_ptr[-16]));
            B_v = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)(void*)&B [8]));
            B_v = silk_mm256_reverse_epi32(B_v);
        }
        else
        {
            /* order == 10: the 32-bit loads fetch only in[i-10], in[i-9]  */
            /* and B[8], B[9] (upper lanes zeroed by the load); the        */
            /* shuffle swaps B[8] and B[9] to match the reversed order.    */
            in_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&in_ptr[-10]));
            B_v = _mm256_cvtepi16_epi32(_mm_loadu_si32(&B [8]));
            B_v = _mm256_shuffle_epi32(B_v, 0x01);
        }
        sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in_v, B_v));

        /* Horizontal add of the eight partial products => Q12 prediction */
        out32_Q12 = silk_mm256_hsum_epi32(sum);

        /* Subtract prediction */
        out32_Q12 = silk_SUB32_ovflw( silk_LSHIFT( (opus_int32)*in_ptr, 12 ), out32_Q12 );

        /* Scale to Q0 */
        out32 = silk_sar_round_32(out32_Q12, 12);

        /* Saturate output */
        out[ i ] = silk_sat16(out32);
    }

    /* Set first 'order' output samples to zero (no full history yet) */
    silk_memset( out, 0, order * sizeof( opus_int16 ) );
}