tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

burg_modified_FIX_sse4_1.c (22172B)


      1 /* Copyright (c) 2014-2020, Cisco Systems, INC
      2   Written by XiangMingZhu WeiZhou MinPeng YanWang FrancisQuiers
      3 
      4   Redistribution and use in source and binary forms, with or without
      5   modification, are permitted provided that the following conditions
      6   are met:
      7 
      8   - Redistributions of source code must retain the above copyright
      9   notice, this list of conditions and the following disclaimer.
     10 
     11   - Redistributions in binary form must reproduce the above copyright
     12   notice, this list of conditions and the following disclaimer in the
     13   documentation and/or other materials provided with the distribution.
     14 
     15   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     16   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     17   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     18   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
     19   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     23   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     24   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <smmintrin.h>
     35 
     36 #include "SigProc_FIX.h"
     37 #include "define.h"
     38 #include "tuning_parameters.h"
     39 #include "pitch.h"
     40 #include "celt/x86/x86cpu.h"
     41 
     42 #define MAX_FRAME_SIZE              384             /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */
     43 
     44 #define QA                          25
     45 #define N_BITS_HEAD_ROOM            3
     46 #define MIN_RSHIFTS                 -16
     47 #define MAX_RSHIFTS                 (32 - QA)
     48 
     49 /* Compute reflection coefficients from input signal */
        /*
         * SSE4.1 implementation of Burg's method for the SILK fixed-point encoder.
         * Runs the order-recursive Burg/parcor recursion on nb_subfr stacked
         * subframes of x and produces:
         *   - A_Q16[0..D-1]  : NEGATED prediction coefficients in Q16
         *                      (see the `A_Q16[ k ] = -...` assignments below);
         *   - *res_nrg, *res_nrg_Q : residual energy and its Q exponent
         *                      (always -rshifts on both exit paths).
         * The recursion is limited by minInvGain_Q30: if the inverse prediction
         * gain would fall below it, the current reflection coefficient is
         * adjusted to hit the cap exactly, remaining coefficients are zeroed,
         * and the loop exits early (reached_max_gain path).
         * When built with OPUS_CHECK_ASM the outputs are cross-checked against
         * the reference C implementation silk_burg_modified_c().
         */
     50 void silk_burg_modified_sse4_1(
     51    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
     52    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
     53    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
     54    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * ( D + subfr_length )       */
     55    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
     56    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
     57    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
     58    const opus_int              D,                  /* I    Order                                                       */
     59    int                         arch                /* I    Run-time architecture                                       */
     60 )
     61 {
     62    opus_int         k, n, s, lz, rshifts, reached_max_gain;
     63    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
     64    const opus_int16 *x_ptr;
     65    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
     66    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
     67    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
     68    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
     69    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
     70    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];
     71    opus_int64       C0_64;
     72 
     73    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
     74    __m128i CONST1 = _mm_set1_epi32(1);
     75 
     76    celt_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
     77 
     78    /* Compute autocorrelations, added over subframes */
     79    C0_64 = silk_inner_prod16( x, x, subfr_length*nb_subfr, arch );
     80    lz = silk_CLZ64(C0_64);
     81    rshifts = 32 + 1 + N_BITS_HEAD_ROOM - lz;
        /* rshifts may be negative (i.e. an UP-shift) for low-energy input;
           it is clamped to [MIN_RSHIFTS, MAX_RSHIFTS] and all correlations
           below are stored in Q(-rshifts). */
     82    if (rshifts > MAX_RSHIFTS) rshifts = MAX_RSHIFTS;
     83    if (rshifts < MIN_RSHIFTS) rshifts = MIN_RSHIFTS;
     84 
     85    if (rshifts > 0) {
     86        C0 = (opus_int32)silk_RSHIFT64(C0_64, rshifts );
     87    } else {
     88        C0 = silk_LSHIFT32((opus_int32)C0_64, -rshifts );
     89    }
     90 
        /* NOTE(review): this initialization is repeated verbatim at the
           "Initialize" step below (after C_first_row is filled), so this first
           write looks redundant — confirm against silk_burg_modified_c. */
     91    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
     92    silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
     93    if( rshifts > 0 ) {
     94        for( s = 0; s < nb_subfr; s++ ) {
     95            x_ptr = x + s * subfr_length;
     96            for( n = 1; n < D + 1; n++ ) {
     97                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
     98                    silk_inner_prod16( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts );
     99            }
    100        }
    101    } else {
    102        for( s = 0; s < nb_subfr; s++ ) {
    103            int i;
    104            opus_int32 d;
    105            x_ptr = x + s * subfr_length;
            /* celt_pitch_xcorr covers lags over the first subfr_length - D
               samples; the short scalar loop below adds the missing tail
               terms so xcorr[n-1] equals the full inner product at lag n. */
    106            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch );
    107            for( n = 1; n < D + 1; n++ ) {
    108               for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ )
    109                  d = MAC16_16( d, x_ptr[ i ], x_ptr[ i - n ] );
    110               xcorr[ n - 1 ] += d;
    111            }
    112            for( n = 1; n < D + 1; n++ ) {
    113                C_first_row[ n - 1 ] += silk_LSHIFT32( xcorr[ n - 1 ], -rshifts );
    114            }
    115        }
    116    }
    117    silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) );
    118 
    119    /* Initialize */
    120    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1;                                /* Q(-rshifts) */
    121 
    122    invGain_Q30 = (opus_int32)1 << 30;
    123    reached_max_gain = 0;
    124    for( n = 0; n < D; n++ ) {
    125        /* Update first row of correlation matrix (without first element) */
    126        /* Update last row of correlation matrix (without last element, stored in reversed order) */
    127        /* Update C * Af */
    128        /* Update C * flipud(Af) (stored in reversed order) */
            /* Two update paths: for rshifts > -2 use scalar silk_SMLAWB
               accumulation; otherwise take the SSE4.1-vectorized path below. */
    129        if( rshifts > -2 ) {
    130            for( s = 0; s < nb_subfr; s++ ) {
    131                x_ptr = x + s * subfr_length;
    132                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    16 - rshifts );        /* Q(16-rshifts) */
    133                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts );        /* Q(16-rshifts) */
    134                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    QA - 16 );             /* Q(QA-16) */
    135                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16 );             /* Q(QA-16) */
    136                for( k = 0; k < n; k++ ) {
    137                    C_first_row[ k ] = silk_SMLAWB( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
    138                    C_last_row[ k ]  = silk_SMLAWB( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
    139                    Atmp_QA = Af_QA[ k ];
    140                    tmp1 = silk_SMLAWB( tmp1, Atmp_QA, x_ptr[ n - k - 1 ]            );                 /* Q(QA-16) */
    141                    tmp2 = silk_SMLAWB( tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ] );                 /* Q(QA-16) */
    142                }
    143                tmp1 = silk_LSHIFT32( -tmp1, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
    144                tmp2 = silk_LSHIFT32( -tmp2, 32 - QA - rshifts );                                       /* Q(16-rshifts) */
    145                for( k = 0; k <= n; k++ ) {
    146                    CAf[ k ] = silk_SMLAWB( CAf[ k ], tmp1, x_ptr[ n - k ]                    );        /* Q( -rshift ) */
    147                    CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] );        /* Q( -rshift ) */
    148                }
    149            }
    150        } else {
    151            for( s = 0; s < nb_subfr; s++ ) {
    152                x_ptr = x + s * subfr_length;
    153                x1  = -silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    -rshifts );            /* Q( -rshifts ) */
    154                x2  = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts );            /* Q( -rshifts ) */
    155                tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ],                    17 );                  /* Q17 */
    156                tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 );                  /* Q17 */
    157 
    158                X1_3210 = _mm_set1_epi32( x1 );
    159                X2_3210 = _mm_set1_epi32( x2 );
    160                TMP1_3210 = _mm_setzero_si128();
    161                TMP2_3210 = _mm_setzero_si128();
                /* Main vector loop: 4 taps per iteration. The epi32 shuffle
                   reverses the four x_ptr samples so lanes line up with
                   C_first_row[k..k+3] (the scalar tail loop handles k >= n-3). */
    162                for( k = 0; k < n - 3; k += 4 ) {
    163                    PTR_3210   = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] );
    164                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] );
    165                    FIRST_3210 = _mm_loadu_si128( (__m128i *)(void*)&C_first_row[ k ] );
    166                    PTR_3210   = _mm_shuffle_epi32( PTR_3210,  _MM_SHUFFLE( 0, 1, 2, 3 ) );
    167                    LAST_3210  = _mm_loadu_si128( (__m128i *)(void*)&C_last_row[ k ] );
    168                    ATMP_3210  = _mm_loadu_si128( (__m128i *)(void*)&Af_QA[ k ] );
    169 
    170                    T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 );
    171                    T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 );
    172 
                    /* ((Af_QA >> 7) + 1) >> 1 is a rounded shift by QA - 17 = 8,
                       i.e. the vector equivalent of the scalar
                       silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ) in the tail loop. */
    173                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 );
    174                    ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 );
    175                    ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 );
    176 
    177                    FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 );
    178                    LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 );
    179 
    180                    PTR_3210   = _mm_mullo_epi32( ATMP_3210, PTR_3210 );
    181                    SUBFR_3210   = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 );
    182 
    183                    _mm_storeu_si128( (__m128i *)(void*)&C_first_row[ k ], FIRST_3210 );
    184                    _mm_storeu_si128( (__m128i *)(void*)&C_last_row[ k ], LAST_3210 );
    185 
    186                    TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 );
    187                    TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 );
    188                }
    189 
                /* Horizontal sum of the four 32-bit lanes into lane 0:
                   fold high 64 bits onto low, then fold lane 1 onto lane 0. */
    190                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) );
    191                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) );
    192                TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) );
    193                TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) );
    194 
    195                tmp1 += _mm_cvtsi128_si32( TMP1_3210 );
    196                tmp2 += _mm_cvtsi128_si32( TMP2_3210 );
    197 
    198                for( ; k < n; k++ ) {
    199                    C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ]            ); /* Q( -rshifts ) */
    200                    C_last_row[ k ]  = silk_MLA( C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */
    201                    Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 );                                   /* Q17 */
    202                    /* We sometimes get overflows in the multiplications (even beyond +/- 2^32),
    203                       but they cancel each other and the real result seems to always fit in a 32-bit
    204                       signed integer. This was determined experimentally, not theoretically (unfortunately). */
    205                    tmp1 = silk_MLA_ovflw( tmp1, x_ptr[ n - k - 1 ],            Atmp1 );                      /* Q17 */
    206                    tmp2 = silk_MLA_ovflw( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 );                      /* Q17 */
    207                }
    208 
    209                tmp1 = -tmp1;                /* Q17 */
    210                tmp2 = -tmp2;                /* Q17 */
    211 
    212                {
    213                    __m128i xmm_tmp1, xmm_tmp2;
    214                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
    215                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;
    216 
    217                    xmm_tmp1 = _mm_set1_epi32( tmp1 );
    218                    xmm_tmp2 = _mm_set1_epi32( tmp2 );
    219 
                    /* Vectorized form of the silk_SMLAWW updates in the tail
                       loop below: even/odd lanes are multiplied as 32x32 -> 64
                       via _mm_mul_epi32, shifted to extract the SMLAWW
                       (>> 16) result, and recombined with a 16-bit blend. */
    220                    for( k = 0; k <= n - 3; k += 4 ) {
    221                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] );
    222                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] );
    223 
    224                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) );
    225 
    226                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 );
    227                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 );
    228 
    229                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
    230                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    231                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );
    232 
    233                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 );
    234                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 );
    235                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 );
    236                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 );
    237 
    238                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 );
    239                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 );
    240                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 );
    241                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 );
    242 
    243                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC );
    244                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC );
    245 
    246                        X1_3210  = _mm_loadu_si128( (__m128i *)(void*)&CAf[ k ] );
    247                        PTR_3210 = _mm_loadu_si128( (__m128i *)(void*)&CAb[ k ] );
    248 
    249                        X1_3210  = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 );
    250                        PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 );
    251 
    252                        _mm_storeu_si128( (__m128i *)(void*)&CAf[ k ], X1_3210 );
    253                        _mm_storeu_si128( (__m128i *)(void*)&CAb[ k ], PTR_3210 );
    254                    }
    255 
    256                    for( ; k <= n; k++ ) {
    257                        CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1,
    258                            silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) );                    /* Q( -rshift ) */
    259                        CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2,
    260                            silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */
    261                    }
    262                }
    263            }
    264        }
    265 
    266        /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
    267        tmp1 = C_first_row[ n ];                                                                        /* Q( -rshifts ) */
    268        tmp2 = C_last_row[ n ];                                                                         /* Q( -rshifts ) */
    269        num  = 0;                                                                                       /* Q( -rshifts ) */
    270        nrg  = silk_ADD32( CAb[ 0 ], CAf[ 0 ] );                                                        /* Q( 1-rshifts ) */
    271        for( k = 0; k < n; k++ ) {
            /* Normalize Af_QA[k] by its headroom (lz) before silk_SMMUL so the
               32x32 high-half multiply keeps maximum precision. */
    272            Atmp_QA = Af_QA[ k ];
    273            lz = silk_CLZ32( silk_abs( Atmp_QA ) ) - 1;
    274            lz = silk_min( 32 - QA, lz );
    275            Atmp1 = silk_LSHIFT32( Atmp_QA, lz );                                                       /* Q( QA + lz ) */
    276 
    277            tmp1 = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( C_last_row[  n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    278            tmp2 = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( C_first_row[ n - k - 1 ], Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    279            num  = silk_ADD_LSHIFT32( num,  silk_SMMUL( CAb[ n - k ],             Atmp1 ), 32 - QA - lz );  /* Q( -rshifts ) */
    280            nrg  = silk_ADD_LSHIFT32( nrg,  silk_SMMUL( silk_ADD32( CAb[ k + 1 ], CAf[ k + 1 ] ),
    281                                                                                Atmp1 ), 32 - QA - lz );    /* Q( 1-rshifts ) */
    282        }
    283        CAf[ n + 1 ] = tmp1;                                                                            /* Q( -rshifts ) */
    284        CAb[ n + 1 ] = tmp2;                                                                            /* Q( -rshifts ) */
    285        num = silk_ADD32( num, tmp2 );                                                                  /* Q( -rshifts ) */
    286        num = silk_LSHIFT32( -num, 1 );                                                                 /* Q( 1-rshifts ) */
    287 
    288        /* Calculate the next order reflection (parcor) coefficient */
    289        if( silk_abs( num ) < nrg ) {
    290            rc_Q31 = silk_DIV32_varQ( num, nrg, 31 );
    291        } else {
            /* |rc| >= 1 would be unstable; saturate to +/- 1.0 in Q31. */
    292            rc_Q31 = ( num > 0 ) ? silk_int32_MAX : silk_int32_MIN;
    293        }
    294 
    295        /* Update inverse prediction gain */
    296        tmp1 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
    297        tmp1 = silk_LSHIFT( silk_SMMUL( invGain_Q30, tmp1 ), 2 );
    298        if( tmp1 <= minInvGain_Q30 ) {
    299            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
    300            tmp2 = ( (opus_int32)1 << 30 ) - silk_DIV32_varQ( minInvGain_Q30, invGain_Q30, 30 );            /* Q30 */
    301            rc_Q31 = silk_SQRT_APPROX( tmp2 );                                                  /* Q15 */
    302            if( rc_Q31 > 0 ) {
    303                 /* Newton-Raphson iteration */
    304                rc_Q31 = silk_RSHIFT32( rc_Q31 + silk_DIV32( tmp2, rc_Q31 ), 1 );                   /* Q15 */
    305                rc_Q31 = silk_LSHIFT32( rc_Q31, 16 );                                               /* Q31 */
    306                if( num < 0 ) {
    307                    /* Ensure adjusted reflection coefficients has the original sign */
    308                    rc_Q31 = -rc_Q31;
    309                }
    310            }
    311            invGain_Q30 = minInvGain_Q30;
    312            reached_max_gain = 1;
    313        } else {
    314            invGain_Q30 = tmp1;
    315        }
    316 
    317        /* Update the AR coefficients */
            /* Levinson-style symmetric update: pair Af[k] with Af[n-k-1],
               iterating only over the first half to avoid double-updating. */
    318        for( k = 0; k < (n + 1) >> 1; k++ ) {
    319            tmp1 = Af_QA[ k ];                                                                  /* QA */
    320            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
    321            Af_QA[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );      /* QA */
    322            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );      /* QA */
    323        }
    324        Af_QA[ n ] = silk_RSHIFT32( rc_Q31, 31 - QA );                                          /* QA */
    325 
    326        if( reached_max_gain ) {
    327            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
    328            for( k = n + 1; k < D; k++ ) {
    329                Af_QA[ k ] = 0;
    330            }
    331            break;
    332        }
    333 
    334        /* Update C * Af and C * Ab */
    335        for( k = 0; k <= n + 1; k++ ) {
    336            tmp1 = CAf[ k ];                                                                    /* Q( -rshifts ) */
    337            tmp2 = CAb[ n - k + 1 ];                                                            /* Q( -rshifts ) */
    338            CAf[ k ]         = silk_ADD_LSHIFT32( tmp1, silk_SMMUL( tmp2, rc_Q31 ), 1 );        /* Q( -rshifts ) */
    339            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32( tmp2, silk_SMMUL( tmp1, rc_Q31 ), 1 );        /* Q( -rshifts ) */
    340        }
    341    }
    342 
    343    if( reached_max_gain ) {
    344        for( k = 0; k < D; k++ ) {
    345            /* Scale coefficients */
    346            A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );
    347        }
    348        /* Subtract energy of preceding samples from C0 */
    349        if( rshifts > 0 ) {
    350            for( s = 0; s < nb_subfr; s++ ) {
    351                x_ptr = x + s * subfr_length;
    352                C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16( x_ptr, x_ptr, D, arch ), rshifts );
    353            }
    354        } else {
    355            for( s = 0; s < nb_subfr; s++ ) {
    356                x_ptr = x + s * subfr_length;
    357                C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts );
    358            }
    359        }
    360        /* Approximate residual energy */
    361        *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 );
    362        *res_nrg_Q = -rshifts;
    363    } else {
    364        /* Return residual energy */
    365        nrg  = CAf[ 0 ];                                                                            /* Q( -rshifts ) */
    366        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
    367        for( k = 0; k < D; k++ ) {
    368            Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 );                                       /* Q16 */
    369            nrg  = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 );                                         /* Q( -rshifts ) */
    370            tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 );                                               /* Q16 */
    371            A_Q16[ k ] = -Atmp1;
    372        }
    373        *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */
    374        *res_nrg_Q = -rshifts;
    375    }
    376 
        /* Debug-build cross-check: run the reference C implementation on the
           same input and assert both paths produce identical results. */
    377 #ifdef OPUS_CHECK_ASM
    378    {
    379        opus_int32 res_nrg_c = 0;
    380        opus_int res_nrg_Q_c = 0;
    381        opus_int32 A_Q16_c[ MAX_LPC_ORDER ] = {0};
    382 
    383        silk_burg_modified_c(
    384            &res_nrg_c,
    385            &res_nrg_Q_c,
    386            A_Q16_c,
    387            x,
    388            minInvGain_Q30,
    389            subfr_length,
    390            nb_subfr,
    391            D,
    392            0
    393        );
    394 
    395        silk_assert( *res_nrg == res_nrg_c );
    396        silk_assert( *res_nrg_Q == res_nrg_Q_c );
    397        silk_assert( !memcmp( A_Q16, A_Q16_c, D * sizeof( *A_Q16 ) ) );
    398    }
    399 #endif
    400 }