/* burg_modified_FIX_sse4_1.c (22172B) */
/* Copyright (c) 2014-2020, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include <xmmintrin.h> 33 #include <emmintrin.h> 34 #include <smmintrin.h> 35 36 #include "SigProc_FIX.h" 37 #include "define.h" 38 #include "tuning_parameters.h" 39 #include "pitch.h" 40 #include "celt/x86/x86cpu.h" 41 42 #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ 43 44 #define QA 25 45 #define N_BITS_HEAD_ROOM 3 46 #define MIN_RSHIFTS -16 47 #define MAX_RSHIFTS (32 - QA) 48 49 /* Compute reflection coefficients from input signal */ 50 void silk_burg_modified_sse4_1( 51 opus_int32 *res_nrg, /* O Residual energy */ 52 opus_int *res_nrg_Q, /* O Residual energy Q value */ 53 opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ 54 const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ 55 const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ 56 const opus_int subfr_length, /* I Input signal subframe length (incl. 
D preceding samples) */ 57 const opus_int nb_subfr, /* I Number of subframes stacked in x */ 58 const opus_int D, /* I Order */ 59 int arch /* I Run-time architecture */ 60 ) 61 { 62 opus_int k, n, s, lz, rshifts, reached_max_gain; 63 opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; 64 const opus_int16 *x_ptr; 65 opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; 66 opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; 67 opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; 68 opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; 69 opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; 70 opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; 71 opus_int64 C0_64; 72 73 __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; 74 __m128i CONST1 = _mm_set1_epi32(1); 75 76 celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); 77 78 /* Compute autocorrelations, added over subframes */ 79 C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch ); 80 lz = silk_CLZ64(C0_64); 81 rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz; 82 if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS; 83 if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS; 84 85 if (rshifts > 0) { 86 C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts ); 87 } else { 88 C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts ); 89 } 90 91 CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ 92 silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); 93 if( rshifts > 0 ) { 94 for( s = 0; s < nb_subfr; s++ ) { 95 x_ptr = x + s * subfr_length; 96 for( n = 1; n < D + 1; n++ ) { 97 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( 98 silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); 99 } 100 } 101 } else { 102 for( s = 0; s < nb_subfr; s++ ) { 103 int i; 104 opus_int32 d; 105 x_ptr = x + s * subfr_length; 106 celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); 107 for( n = 1; n < D + 1; 
n++ ) { 108 for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) 109 d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] ); 110 xcorr[ n - 1 ] += d; 111 } 112 for( n = 1; n < D + 1; n++ ) { 113 C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts ); 114 } 115 } 116 } 117 silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); 118 119 /* Initialize */ 120 CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ 121 122 invGain_Q30 = (opus_int32)1 << 30; 123 reached_max_gain = 0; 124 for( n = 0; n < D; n++ ) { 125 /* Update first row of correlation matrix (without first element) */ 126 /* Update last row of correlation matrix (without last element, stored in reversed order) */ 127 /* Update C * Af */ 128 /* Update C * flipud(Af) (stored in reversed order) */ 129 if( rshifts > -2 ) { 130 for( s = 0; s < nb_subfr; s++ ) { 131 x_ptr = x + s * subfr_length; 132 x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], 16 - rshifts ); /* Q(16-rshifts) */ 133 x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts ); /* Q(16-rshifts) */ 134 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], QA - 16 ); /* Q(QA-16) */ 135 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 ); /* Q(QA-16) */ 136 for( k = 0; k < n; k++ ) { 137 C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ 138 C_last_row[ k ] = silk_SMLAWB( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ 139 Atmp_QA = Af_QA[ k ]; 140 tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ] ); /* Q(QA-16) */ 141 tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] ); /* Q(QA-16) */ 142 } 143 tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts ); /* Q(16-rshifts) */ 144 tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts ); /* Q(16-rshifts) */ 145 for( k = 0; k <= n; k++ ) { 146 CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ] ); /* Q( 
-rshift ) */ 147 CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */ 148 } 149 } 150 } else { 151 for( s = 0; s < nb_subfr; s++ ) { 152 x_ptr = x + s * subfr_length; 153 x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */ 154 x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */ 155 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */ 156 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */ 157 158 X1_3210 = _mm_set1_epi32( x1 ); 159 X2_3210 = _mm_set1_epi32( x2 ); 160 TMP1_3210 = _mm_setzero_si128(); 161 TMP2_3210 = _mm_setzero_si128(); 162 for( k = 0; k < n - 3; k += 4 ) { 163 PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] ); 164 SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] ); 165 FIRST_3210 = _mm_loadu_si128( (__m128i *)(void*)&C_first_row[ k ] ); 166 PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) ); 167 LAST_3210 = _mm_loadu_si128( (__m128i *)(void*)&C_last_row[ k ] ); 168 ATMP_3210 = _mm_loadu_si128( (__m128i *)(void*)&Af_QA[ k ] ); 169 170 T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 ); 171 T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 ); 172 173 ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 ); 174 ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 ); 175 ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 ); 176 177 FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 ); 178 LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 ); 179 180 PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 ); 181 SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 ); 182 183 _mm_storeu_si128( (__m128i *)(void*)&C_first_row[ k ], FIRST_3210 ); 184 _mm_storeu_si128( (__m128i *)(void*)&C_last_row[ k ], LAST_3210 ); 185 186 TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 ); 187 TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 ); 188 } 189 190 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) 
); 191 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) ); 192 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) ); 193 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) ); 194 195 tmp1 += _mm_cvtsi128_si32( TMP1_3210 ); 196 tmp2 += _mm_cvtsi128_si32( TMP2_3210 ); 197 198 for( ; k < n; k++ ) { 199 C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ 200 C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ 201 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ 202 /* We sometimes get overflows in the multiplications (even beyond +/- 2^32), 203 but they cancel each other and the real result seems to always fit in a 32-bit 204 signed integer. This was determined experimentally, not theoretically (unfortunately). */ 205 tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ 206 tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ 207 } 208 209 tmp1 = -tmp1; /* Q17 */ 210 tmp2 = -tmp2; /* Q17 */ 211 212 { 213 __m128i xmm_tmp1, xmm_tmp2; 214 __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; 215 __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; 216 217 xmm_tmp1 = _mm_set1_epi32( tmp1 ); 218 xmm_tmp2 = _mm_set1_epi32( tmp2 ); 219 220 for( k = 0; k <= n - 3; k += 4 ) { 221 xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] ); 222 xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] ); 223 224 xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) ); 225 226 xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 ); 227 xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 ); 228 229 /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ 230 xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 
231 xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); 232 233 xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 ); 234 xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 ); 235 xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 ); 236 xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 ); 237 238 xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 ); 239 xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 ); 240 xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 ); 241 xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 ); 242 243 xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC ); 244 xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC ); 245 246 X1_3210 = _mm_loadu_si128( (__m128i *)(void*)&CAf[ k ] ); 247 PTR_3210 = _mm_loadu_si128( (__m128i *)(void*)&CAb[ k ] ); 248 249 X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 ); 250 PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 ); 251 252 _mm_storeu_si128( (__m128i *)(void*)&CAf[ k ], X1_3210 ); 253 _mm_storeu_si128( (__m128i *)(void*)&CAb[ k ], PTR_3210 ); 254 } 255 256 for( ; k <= n; k++ ) { 257 CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1, 258 silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */ 259 CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2, 260 silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */ 261 } 262 } 263 } 264 } 265 266 /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ 267 tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */ 268 tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */ 269 num = 0; /* Q( -rshifts ) */ 270 nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */ 271 for( k = 0; k < n; k++ ) { 272 Atmp_QA = Af_QA[ k ]; 273 lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1; 274 lz = silk_min( 32 - QA, lz ); 275 
Atmp1 = silk_LSHIFT32( Atmp_QA, lz ); /* Q( QA + lz ) */ 276 277 tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ 278 tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ 279 num = silk_ADD_LSHIFT32( num, silk_SMMUL( CAb[ n - k ], Atmp1 ), 32 - QA - lz ); /* Q( -rshifts ) */ 280 nrg = silk_ADD_LSHIFT32( nrg, silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ), 281 Atmp1 ), 32 - QA - lz ); /* Q( 1-rshifts ) */ 282 } 283 CAf[ n + 1 ] = tmp1; /* Q( -rshifts ) */ 284 CAb[ n + 1 ] = tmp2; /* Q( -rshifts ) */ 285 num = silk_ADD32( num, tmp2 ); /* Q( -rshifts ) */ 286 num = silk_LSHIFT32( -num, 1 ); /* Q( 1-rshifts ) */ 287 288 /* Calculate the next order reflection (parcor) coefficient */ 289 if( silk_abs( num ) < nrg ) { 290 rc_Q31 = silk_DIV32_varQ( num, nrg, 31 ); 291 } else { 292 rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN; 293 } 294 295 /* Update inverse prediction gain */ 296 tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 ); 297 tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 ); 298 if( tmp1 <= minInvGain_Q30 ) { 299 /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */ 300 tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 ); /* Q30 */ 301 rc_Q31 = silk_SQRT_APPROX( tmp2 ); /* Q15 */ 302 if( rc_Q31 > 0 ) { 303 /* Newton-Raphson iteration */ 304 rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 ); /* Q15 */ 305 rc_Q31 = silk_LSHIFT32( rc_Q31, 16 ); /* Q31 */ 306 if( num < 0 ) { 307 /* Ensure adjusted reflection coefficients has the original sign */ 308 rc_Q31 = -rc_Q31; 309 } 310 } 311 invGain_Q30 = minInvGain_Q30; 312 reached_max_gain = 1; 313 } else { 314 invGain_Q30 = tmp1; 315 } 316 317 /* Update the AR coefficients */ 318 for( k = 0; k < (n + 1) >> 1; k++ ) { 319 tmp1 = Af_QA[ k ]; /* QA */ 320 tmp2 = Af_QA[ 
n - k - 1 ]; /* QA */ 321 Af_QA[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* QA */ 322 Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* QA */ 323 } 324 Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA ); /* QA */ 325 326 if( reached_max_gain ) { 327 /* Reached max prediction gain; set remaining coefficients to zero and exit loop */ 328 for( k = n + 1; k < D; k++ ) { 329 Af_QA[ k ] = 0; 330 } 331 break; 332 } 333 334 /* Update C * Af and C * Ab */ 335 for( k = 0; k <= n + 1; k++ ) { 336 tmp1 = CAf[ k ]; /* Q( -rshifts ) */ 337 tmp2 = CAb[ n - k + 1 ]; /* Q( -rshifts ) */ 338 CAf[ k ] = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 ); /* Q( -rshifts ) */ 339 CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 ); /* Q( -rshifts ) */ 340 } 341 } 342 343 if( reached_max_gain ) { 344 for( k = 0; k < D; k++ ) { 345 /* Scale coefficients */ 346 A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); 347 } 348 /* Subtract energy of preceding samples from C0 */ 349 if( rshifts > 0 ) { 350 for( s = 0; s < nb_subfr; s++ ) { 351 x_ptr = x + s * subfr_length; 352 C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts ); 353 } 354 } else { 355 for( s = 0; s < nb_subfr; s++ ) { 356 x_ptr = x + s * subfr_length; 357 C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts ); 358 } 359 } 360 /* Approximate residual energy */ 361 *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 ); 362 *res_nrg_Q = -rshifts; 363 } else { 364 /* Return residual energy */ 365 nrg = CAf[ 0 ]; /* Q( -rshifts ) */ 366 tmp1 = (opus_int32)1 << 16; /* Q16 */ 367 for( k = 0; k < D; k++ ) { 368 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */ 369 nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */ 370 tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */ 371 A_Q16[ k ] = -Atmp1; 372 } 373 *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( 
FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ 374 *res_nrg_Q = -rshifts; 375 } 376 377 #ifdef OPUS_CHECK_ASM 378 { 379 opus_int32 res_nrg_c = 0; 380 opus_int res_nrg_Q_c = 0; 381 opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0}; 382 383 silk_burg_modified_c( 384 &res_nrg_c, 385 &res_nrg_Q_c, 386 A_Q16_c, 387 x, 388 minInvGain_Q30, 389 subfr_length, 390 nb_subfr, 391 D, 392 0 393 ); 394 395 silk_assert( *res_nrg == res_nrg_c ); 396 silk_assert( *res_nrg_Q == res_nrg_Q_c ); 397 silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) ); 398 } 399 #endif 400 }