tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

VAD.c (15007B)


      1 /***********************************************************************
      2 Copyright (c) 2006-2011, Skype Limited. All rights reserved.
      3 Redistribution and use in source and binary forms, with or without
      4 modification, are permitted provided that the following conditions
      5 are met:
      6 - Redistributions of source code must retain the above copyright notice,
      7 this list of conditions and the following disclaimer.
      8 - Redistributions in binary form must reproduce the above copyright
      9 notice, this list of conditions and the following disclaimer in the
     10 documentation and/or other materials provided with the distribution.
     11 - Neither the name of Internet Society, IETF or IETF Trust, nor the
     12 names of specific contributors, may be used to endorse or promote
     13 products derived from this software without specific prior written
     14 permission.
     15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
     19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
     20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
     22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
     25 POSSIBILITY OF SUCH DAMAGE.
     26 ***********************************************************************/
     27 
     28 #ifdef HAVE_CONFIG_H
     29 #include "config.h"
     30 #endif
     31 
     32 #include "main.h"
     33 #include "stack_alloc.h"
     34 
     35 /* Silk VAD noise level estimation (forward declaration)                          */
        /* This file-local static inline prototype is only emitted when                   */
        /* OPUS_X86_MAY_HAVE_SSE4_1 is not defined. The definition near the end of this   */
        /* file drops its "static OPUS_INLINE" qualifier under the same condition, so in  */
        /* that configuration the function has external linkage; presumably its prototype */
        /* then comes from a header - TODO confirm against main.h.                        */
     36 # if !defined(OPUS_X86_MAY_HAVE_SSE4_1)
     37 static OPUS_INLINE void silk_VAD_GetNoiseLevels(
     38    const opus_int32             pX[ VAD_N_BANDS ], /* I    subband energies                            */
     39    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
     40 );
     41 #endif
     42 
     43 /*************************************************************/
     44 /* Initialization of the Silk VAD: zero the state, then load */
     45 /* the start-up noise estimates and SNR for every band       */
     46 /*************************************************************/
     47 opus_int silk_VAD_Init(                                         /* O    Return value, 0 if success                  */
     48    silk_VAD_state              *psSilk_VAD                     /* I/O  Pointer to Silk VAD state                   */
     49 )
     50 {
     51    opus_int band;
     52 
     53    /* Start from an all-zero state */
     54    silk_memset( psSilk_VAD, 0, sizeof( *psSilk_VAD ) );
     55 
     56    /* One pass over the bands fills in every per-band field */
     57    for( band = 0; band < VAD_N_BANDS; band++ ) {
     58        /* Approx pink noise bias (psd proportional to inverse of frequency), floored at 1 via silk_max_32 */
     59        psSilk_VAD->NoiseLevelBias[ band ] = silk_max_32( silk_DIV32_16( VAD_NOISE_LEVELS_BIAS, band + 1 ), 1 );
     60 
     61        /* Initial noise level is 100 times the bias; its inverse is kept alongside it */
     62        psSilk_VAD->NL[ band ]     = silk_MUL( 100, psSilk_VAD->NoiseLevelBias[ band ] );
     63        psSilk_VAD->inv_NL[ band ] = silk_DIV32( silk_int32_MAX, psSilk_VAD->NL[ band ] );
     64 
     65        /* Smoothed energy-to-noise ratio: 100 * 256 --> 20 dB SNR */
     66        psSilk_VAD->NrgRatioSmth_Q8[ band ] = 100 * 256;
     67    }
     68 
     69    /* Frame counter starts at 15; silk_VAD_GetNoiseLevels() derives its minimum smoothing coefficient from it */
     70    psSilk_VAD->counter = 15;
     71 
     72    /* Always reports success */
     73    return( 0 );
     74 }
     75 
     76 /* Weighting factors for tilt measure                                              */
        /* One entry per band. silk_VAD_GetSA_Q8_c() passes tiltWeights[ b ] together with */
        /* the band's SNR_Q7 to silk_SMLAWB to accumulate input_tilt. The two lowest bands */
        /* carry positive weights, the two highest bands negative ones.                    */
     77 static const opus_int32 tiltWeights[ VAD_N_BANDS ] = { 30000, 6000, -12000, -12000 };
     78 
     79 /***************************************/
     80 /* Get the speech activity level in Q8 */
     81 /***************************************/
        /* Splits the input frame into VAD_N_BANDS sub-bands, compares the energy of each   */
        /* band with the tracked noise level and derives from that:                         */
        /*   - psEncC->speech_activity_Q8         speech activity, capped at silk_uint8_MAX  */
        /*   - psEncC->input_tilt_Q15             frequency tilt measure                     */
        /*   - psEncC->input_quality_bands_Q15[]  per-band quality from the smoothed SNR     */
        /* The analysis filter states, HP filter state, sub-frame energies, noise levels    */
        /* and smoothed energy ratios held in psEncC->sVAD are updated along the way.       */
        /* ret is initialised to 0 and never modified, so the function always returns 0.    */
     82 opus_int silk_VAD_GetSA_Q8_c(                                   /* O    Return value, 0 if success                  */
     83    silk_encoder_state          *psEncC,                        /* I/O  Encoder state                               */
     84    const opus_int16            pIn[]                           /* I    PCM input                                   */
     85 )
     86 {
     87    opus_int   SA_Q15, pSNR_dB_Q7, input_tilt;
     88    opus_int   decimated_framelength1, decimated_framelength2;
     89    opus_int   decimated_framelength;
     90    opus_int   dec_subframe_length, dec_subframe_offset, SNR_Q7, i, b, s;
     91    opus_int32 sumSquared, smooth_coef_Q16;
     92    opus_int16 HPstateTmp;
     93    VARDECL( opus_int16, X );
     94    opus_int32 Xnrg[ VAD_N_BANDS ];
     95    opus_int32 NrgToNoiseRatio_Q8[ VAD_N_BANDS ];
     96    opus_int32 speech_nrg, x_tmp;
     97    opus_int   X_offset[ VAD_N_BANDS ];
     98    opus_int   ret = 0;
     99    silk_VAD_state *psSilk_VAD = &psEncC->sVAD;
    100    SAVE_STACK;
    101 
    102    /* Safety checks */
           /* The last check requires frame_length to be a multiple of 8, the total decimation */
           /* factor applied to the lowest bands below                                         */
    103    silk_assert( VAD_N_BANDS == 4 );
    104    celt_assert( MAX_FRAME_LENGTH >= psEncC->frame_length );
    105    celt_assert( psEncC->frame_length <= 512 );
    106    celt_assert( psEncC->frame_length == 8 * silk_RSHIFT( psEncC->frame_length, 3 ) );
    107 
    108    /***********************/
    109    /* Filter and Decimate */
    110    /***********************/
           /* frame_length / 2, / 4 and / 8 respectively */
    111    decimated_framelength1 = silk_RSHIFT( psEncC->frame_length, 1 );
    112    decimated_framelength2 = silk_RSHIFT( psEncC->frame_length, 2 );
    113    decimated_framelength = silk_RSHIFT( psEncC->frame_length, 3 );
    114    /* Decimate into 4 bands:
    115       0       L      3L       L              3L                             5L
    116               -      --       -              --                             --
    117               8       8       2               4                              4
    118 
    119       [0-1 kHz| temp. |1-2 kHz|    2-4 kHz    |            4-8 kHz           |
    120 
    121       They're arranged to allow the minimal ( frame_length / 4 ) extra
    122       scratch space during the downsampling process */
    123    X_offset[ 0 ] = 0;
    124    X_offset[ 1 ] = decimated_framelength + decimated_framelength2;
    125    X_offset[ 2 ] = X_offset[ 1 ] + decimated_framelength;
    126    X_offset[ 3 ] = X_offset[ 2 ] + decimated_framelength2;
           /* Total size is X_offset[ 3 ] + frame_length / 2 = 5 * frame_length / 4 samples */
    127    ALLOC( X, X_offset[ 3 ] + decimated_framelength1, opus_int16 );
    128 
           /* Each stage below reads from the start of X (or pIn) and writes its low band back */
           /* to the start of X, with the high band going to that band's offset                */
    129    /* 0-8 kHz to 0-4 kHz and 4-8 kHz */
    130    silk_ana_filt_bank_1( pIn, &psSilk_VAD->AnaState[  0 ],
    131        X, &X[ X_offset[ 3 ] ], psEncC->frame_length );
    132 
    133    /* 0-4 kHz to 0-2 kHz and 2-4 kHz */
    134    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState1[ 0 ],
    135        X, &X[ X_offset[ 2 ] ], decimated_framelength1 );
    136 
    137    /* 0-2 kHz to 0-1 kHz and 1-2 kHz */
    138    silk_ana_filt_bank_1( X, &psSilk_VAD->AnaState2[ 0 ],
    139        X, &X[ X_offset[ 1 ] ], decimated_framelength2 );
    140 
    141    /*********************************************/
    142    /* HP filter on lowest band (differentiator) */
    143    /*********************************************/
           /* Works in place from the end of the band towards the start: every sample is      */
           /* halved and the halved predecessor is subtracted from it. HPstate supplies the   */
           /* predecessor of X[ 0 ] and is then replaced by this frame's last halved sample.  */
    144    X[ decimated_framelength - 1 ] = silk_RSHIFT( X[ decimated_framelength - 1 ], 1 );
    145    HPstateTmp = X[ decimated_framelength - 1 ];
    146    for( i = decimated_framelength - 1; i > 0; i-- ) {
    147        X[ i - 1 ]  = silk_RSHIFT( X[ i - 1 ], 1 );
    148        X[ i ]     -= X[ i - 1 ];
    149    }
    150    X[ 0 ] -= psSilk_VAD->HPstate;
    151    psSilk_VAD->HPstate = HPstateTmp;
    152 
    153    /*************************************/
    154    /* Calculate the energy in each band */
    155    /*************************************/
    156    for( b = 0; b < VAD_N_BANDS; b++ ) {
    157        /* Find the decimated framelength in the non-uniformly divided bands */
               /* With VAD_N_BANDS == 4 the shift is 3, 3, 2, 1 for b = 0..3, i.e. the band   */
               /* lengths are frame_length / 8, / 8, / 4 and / 2                              */
    158        decimated_framelength = silk_RSHIFT( psEncC->frame_length, silk_min_int( VAD_N_BANDS - b, VAD_N_BANDS - 1 ) );
    159 
    160        /* Split length into subframe lengths */
    161        dec_subframe_length = silk_RSHIFT( decimated_framelength, VAD_INTERNAL_SUBFRAMES_LOG2 );
    162        dec_subframe_offset = 0;
    163 
    164        /* Compute energy per sub-frame */
    165        /* initialize with summed energy of last subframe */
    166        Xnrg[ b ] = psSilk_VAD->XnrgSubfr[ b ];
    167        for( s = 0; s < VAD_INTERNAL_SUBFRAMES; s++ ) {
    168            sumSquared = 0;
    169            for( i = 0; i < dec_subframe_length; i++ ) {
    170                /* The energy will be less than dec_subframe_length * ( silk_int16_MIN / 8 ) ^ 2.            */
    171                /* Therefore we can accumulate with no risk of overflow (unless dec_subframe_length > 128)  */
    172                x_tmp = silk_RSHIFT(
    173                    X[ X_offset[ b ] + i + dec_subframe_offset ], 3 );
    174                sumSquared = silk_SMLABB( sumSquared, x_tmp, x_tmp );
    175 
    176                /* Safety check */
    177                silk_assert( sumSquared >= 0 );
    178            }
    179 
    180            /* Add/saturate summed energy of current subframe */
    181            if( s < VAD_INTERNAL_SUBFRAMES - 1 ) {
    182                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], sumSquared );
    183            } else {
    184                /* Look-ahead subframe */
                       /* Only half of the final sub-frame's energy is added here */
    185                Xnrg[ b ] = silk_ADD_POS_SAT32( Xnrg[ b ], silk_RSHIFT( sumSquared, 1 ) );
    186            }
    187 
    188            dec_subframe_offset += dec_subframe_length;
    189        }
               /* Keep the unhalved energy of the final sub-frame; it is the starting value of */
               /* Xnrg[ b ] on the next call                                                   */
    190        psSilk_VAD->XnrgSubfr[ b ] = sumSquared;
    191    }
    192 
    193    /********************/
    194    /* Noise estimation */
    195    /********************/
           /* Updates psSilk_VAD->NL[] (and inv_NL[]) from the band energies just computed */
    196    silk_VAD_GetNoiseLevels( &Xnrg[ 0 ], psSilk_VAD );
    197 
    198    /***********************************************/
    199    /* Signal-plus-noise to noise ratio estimation */
    200    /***********************************************/
    201    sumSquared = 0;
    202    input_tilt = 0;
    203    for( b = 0; b < VAD_N_BANDS; b++ ) {
    204        speech_nrg = Xnrg[ b ] - psSilk_VAD->NL[ b ];
    205        if( speech_nrg > 0 ) {
    206            /* Divide, with sufficient resolution */
                   /* Mask clear means Xnrg[ b ] < 2^23, so the numerator can be shifted up by 8 bits */
                   /* without overflowing; otherwise the denominator is shifted down by 8 bits        */
                   /* instead. The + 1 keeps the divisor non-zero for a non-negative NL.              */
    207            if( ( Xnrg[ b ] & 0xFF800000 ) == 0 ) {
    208                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( silk_LSHIFT( Xnrg[ b ], 8 ), psSilk_VAD->NL[ b ] + 1 );
    209            } else {
    210                NrgToNoiseRatio_Q8[ b ] = silk_DIV32( Xnrg[ b ], silk_RSHIFT( psSilk_VAD->NL[ b ], 8 ) + 1 );
    211            }
    212 
    213            /* Convert to log domain */
                   /* NOTE(review): the 8 * 128 offset presumably removes the Q8 scaling of the ratio */
                   /* from the Q7 log value - confirm against silk_lin2log                            */
    214            SNR_Q7 = silk_lin2log( NrgToNoiseRatio_Q8[ b ] ) - 8 * 128;
    215 
    216            /* Sum-of-squares */
    217            sumSquared = silk_SMLABB( sumSquared, SNR_Q7, SNR_Q7 );          /* Q14 */
    218 
    219            /* Tilt measure */
    220            if( speech_nrg < ( (opus_int32)1 << 20 ) ) {
    221                /* Scale down SNR value for small subband speech energies */
    222                SNR_Q7 = silk_SMULWB( silk_LSHIFT( silk_SQRT_APPROX( speech_nrg ), 6 ), SNR_Q7 );
    223            }
    224            input_tilt = silk_SMLAWB( input_tilt, tiltWeights[ b ], SNR_Q7 );
    225        } else {
                   /* Band energy does not exceed the noise level: use a ratio of 256 (1.0 in Q8); */
                   /* the band contributes nothing to sumSquared or input_tilt                     */
    226            NrgToNoiseRatio_Q8[ b ] = 256;
    227        }
    228    }
    229 
    230    /* Mean-of-squares */
    231    sumSquared = silk_DIV32_16( sumSquared, VAD_N_BANDS ); /* Q14 */
    232 
    233    /* Root-mean-square approximation, scale to dBs, and store in the local pSNR_dB_Q7 */
    234    pSNR_dB_Q7 = (opus_int16)( 3 * silk_SQRT_APPROX( sumSquared ) ); /* Q7 */
    235 
    236    /*********************************/
    237    /* Speech Probability Estimation */
    238    /*********************************/
    239    SA_Q15 = silk_sigm_Q15( silk_SMULWB( VAD_SNR_FACTOR_Q16, pSNR_dB_Q7 ) - VAD_NEGATIVE_OFFSET_Q5 );
    240 
    241    /**************************/
    242    /* Frequency Tilt Measure */
    243    /**************************/
           /* Sigmoid output is re-centred by subtracting 16384 and then doubled */
    244    psEncC->input_tilt_Q15 = silk_LSHIFT( silk_sigm_Q15( input_tilt ) - 16384, 1 );
    245 
    246    /**************************************************/
    247    /* Scale the sigmoid output based on power levels */
    248    /**************************************************/
    249    speech_nrg = 0;
    250    for( b = 0; b < VAD_N_BANDS; b++ ) {
    251        /* Accumulate signal-without-noise energies, higher frequency bands have more weight */
    252        speech_nrg += ( b + 1 ) * silk_RSHIFT( Xnrg[ b ] - psSilk_VAD->NL[ b ], 4 );
    253    }
    254 
           /* 20 ms frames ( frame_length == 20 * fs_kHz samples ): halve the accumulated energy */
    255    if( psEncC->frame_length == 20 * psEncC->fs_kHz ) {
    256        speech_nrg = silk_RSHIFT32( speech_nrg, 1 );
    257    }
    258    /* Power scaling */
           /* No energy above the noise level: halve the activity. Small energies ( < 16384 ):  */
           /* scale the activity by ( 32768 + sqrt( speech_nrg << 16 ) ) through silk_SMULWB.    */
           /* Larger energies leave SA_Q15 untouched.                                            */
    259    if( speech_nrg <= 0 ) {
    260        SA_Q15 = silk_RSHIFT( SA_Q15, 1 );
    261    } else if( speech_nrg < 16384 ) {
    262        speech_nrg = silk_LSHIFT32( speech_nrg, 16 );
    263 
    264        /* square-root */
    265        speech_nrg = silk_SQRT_APPROX( speech_nrg );
    266        SA_Q15 = silk_SMULWB( 32768 + speech_nrg, SA_Q15 );
    267    }
    268 
    269    /* Copy the resulting speech activity in Q8 */
    270    psEncC->speech_activity_Q8 = silk_min_int( silk_RSHIFT( SA_Q15, 7 ), silk_uint8_MAX );
    271 
    272    /***********************************/
    273    /* Energy Level and SNR estimation */
    274    /***********************************/
    275    /* Smoothing coefficient */
           /* Derived from SA_Q15 * SA_Q15, so higher speech activity gives faster smoothing */
    276    smooth_coef_Q16 = silk_SMULWB( VAD_SNR_SMOOTH_COEF_Q18, silk_SMULWB( (opus_int32)SA_Q15, SA_Q15 ) );
    277 
           /* 10 ms frames ( frame_length == 10 * fs_kHz samples ): halve the smoothing coefficient */
    278    if( psEncC->frame_length == 10 * psEncC->fs_kHz ) {
    279        smooth_coef_Q16 >>= 1;
    280    }
    281 
    282    for( b = 0; b < VAD_N_BANDS; b++ ) {
    283        /* compute smoothed energy-to-noise ratio per band */
    284        psSilk_VAD->NrgRatioSmth_Q8[ b ] = silk_SMLAWB( psSilk_VAD->NrgRatioSmth_Q8[ b ],
    285            NrgToNoiseRatio_Q8[ b ] - psSilk_VAD->NrgRatioSmth_Q8[ b ], smooth_coef_Q16 );
    286 
    287        /* signal to noise ratio in dB per band */
    288        SNR_Q7 = 3 * ( silk_lin2log( psSilk_VAD->NrgRatioSmth_Q8[b] ) - 8 * 128 );
    289        /* quality = sigmoid( 0.25 * ( SNR_dB - 16 ) ); */
    290        psEncC->input_quality_bands_Q15[ b ] = silk_sigm_Q15( silk_RSHIFT( SNR_Q7 - 16 * 128, 4 ) );
    291    }
    292 
    293    RESTORE_STACK;
    294    return( ret );
    295 }
    296 
    297 /**************************/
    298 /* Noise level estimation */
    299 /**************************/
        /* Updates the per-band noise level estimates psSilk_VAD->NL[] and their inverses  */
        /* psSilk_VAD->inv_NL[] from the current subband energies pX[]. The smoothing is   */
        /* done on inverse energies, and the result is inverted back and capped at         */
        /* 0x00FFFFFF. psSilk_VAD->counter is incremented while it is below 1000.          */
        /* The function is static inline unless OPUS_X86_MAY_HAVE_SSE4_1 is defined, in    */
        /* which case it has external linkage.                                             */
    300 # if  !defined(OPUS_X86_MAY_HAVE_SSE4_1)
    301 static OPUS_INLINE
    302 #endif
    303 void silk_VAD_GetNoiseLevels(
    304    const opus_int32            pX[ VAD_N_BANDS ],  /* I    subband energies                            */
    305    silk_VAD_state              *psSilk_VAD         /* I/O  Pointer to Silk VAD state                   */
    306 )
    307 {
    308    opus_int   k;
    309    opus_int32 nl, nrg, inv_nrg;
    310    opus_int   coef, min_coef;
    311 
    312    /* Initially faster smoothing */
           /* min_coef = silk_int16_MAX / ( counter / 16 + 1 ): a lower bound on the smoothing    */
           /* coefficient that shrinks as the counter grows and is dropped once it reaches 1000  */
    313    if( psSilk_VAD->counter < 1000 ) { /* 1000 = 20 sec */
    314        min_coef = silk_DIV32_16( silk_int16_MAX, silk_RSHIFT( psSilk_VAD->counter, 4 ) + 1 );
    315        /* Increment frame counter */
    316        psSilk_VAD->counter++;
    317    } else {
    318        min_coef = 0;
    319    }
    320 
    321    for( k = 0; k < VAD_N_BANDS; k++ ) {
    322        /* Get old noise level estimate for current band */
    323        nl = psSilk_VAD->NL[ k ];
    324        silk_assert( nl >= 0 );
    325 
    326        /* Add bias */
    327        nrg = silk_ADD_POS_SAT32( pX[ k ], psSilk_VAD->NoiseLevelBias[ k ] );
    328        silk_assert( nrg > 0 );
    329 
    330        /* Invert energies */
    331        inv_nrg = silk_DIV32( silk_int32_MAX, nrg );
    332        silk_assert( inv_nrg >= 0 );
    333 
    334        /* Less update when subband energy is high */
               /* nrg > 8 * nl : one eighth of the nominal coefficient                          */
               /* nrg < nl     : the full nominal coefficient                                   */
               /* otherwise    : coefficient computed from inv_nrg * nl, so it falls as nrg     */
               /*                rises relative to nl                                           */
    335        if( nrg > silk_LSHIFT( nl, 3 ) ) {
    336            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 >> 3;
    337        } else if( nrg < nl ) {
    338            coef = VAD_NOISE_LEVEL_SMOOTH_COEF_Q16;
    339        } else {
    340            coef = silk_SMULWB( silk_SMULWW( inv_nrg, nl ), VAD_NOISE_LEVEL_SMOOTH_COEF_Q16 << 1 );
    341        }
    342 
    343        /* Initially faster smoothing */
    344        coef = silk_max_int( coef, min_coef );
    345 
    346        /* Smooth inverse energies */
               /* Moves inv_NL[ k ] towards inv_nrg by a step controlled by coef */
    347        psSilk_VAD->inv_NL[ k ] = silk_SMLAWB( psSilk_VAD->inv_NL[ k ], inv_nrg - psSilk_VAD->inv_NL[ k ], coef );
    348        silk_assert( psSilk_VAD->inv_NL[ k ] >= 0 );
    349 
    350        /* Compute noise level by inverting again */
               /* NOTE(review): the assert above allows inv_NL[ k ] == 0, which would divide by zero */
               /* here - confirm the smoothing cannot actually reach 0                               */
    351        nl = silk_DIV32( silk_int32_MAX, psSilk_VAD->inv_NL[ k ] );
    352        silk_assert( nl >= 0 );
    353 
    354        /* Limit noise levels (guarantee 7 bits of head room) */
    355        nl = silk_min( nl, 0x00FFFFFF );
    356 
    357        /* Store as part of state */
    358        psSilk_VAD->NL[ k ] = nl;
    359    }
    360 }