analysis.c (36284B)
1 /* Copyright (c) 2011 Xiph.Org Foundation 2 Written by Jean-Marc Valin */ 3 /* 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 - Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 11 - Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 19 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #ifdef HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #define ANALYSIS_C 33 34 #ifdef MLP_TRAINING 35 #include <stdio.h> 36 #endif 37 38 #include "mathops.h" 39 #include "kiss_fft.h" 40 #include "celt.h" 41 #include "modes.h" 42 #include "arch.h" 43 #include "quant_bands.h" 44 #include "analysis.h" 45 #include "mlp.h" 46 #include "stack_alloc.h" 47 #include "float_cast.h" 48 49 #ifndef M_PI 50 #define M_PI 3.141592653 51 #endif 52 53 #ifndef DISABLE_FLOAT_API 54 55 #define TRANSITION_PENALTY 10 56 57 static const float dct_table[128] = { 58 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 59 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 0.250000f, 60 0.351851f, 0.338330f, 0.311806f, 0.273300f, 0.224292f, 0.166664f, 0.102631f, 0.034654f, 61 -0.034654f,-0.102631f,-0.166664f,-0.224292f,-0.273300f,-0.311806f,-0.338330f,-0.351851f, 62 0.346760f, 0.293969f, 0.196424f, 0.068975f,-0.068975f,-0.196424f,-0.293969f,-0.346760f, 63 -0.346760f,-0.293969f,-0.196424f,-0.068975f, 0.068975f, 0.196424f, 0.293969f, 0.346760f, 64 0.338330f, 0.224292f, 0.034654f,-0.166664f,-0.311806f,-0.351851f,-0.273300f,-0.102631f, 65 0.102631f, 0.273300f, 0.351851f, 0.311806f, 0.166664f,-0.034654f,-0.224292f,-0.338330f, 66 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 67 0.326641f, 0.135299f,-0.135299f,-0.326641f,-0.326641f,-0.135299f, 0.135299f, 0.326641f, 68 0.311806f, 0.034654f,-0.273300f,-0.338330f,-0.102631f, 0.224292f, 0.351851f, 0.166664f, 69 -0.166664f,-0.351851f,-0.224292f, 0.102631f, 0.338330f, 0.273300f,-0.034654f,-0.311806f, 70 0.293969f,-0.068975f,-0.346760f,-0.196424f, 0.196424f, 0.346760f, 0.068975f,-0.293969f, 71 -0.293969f, 0.068975f, 0.346760f, 0.196424f,-0.196424f,-0.346760f,-0.068975f, 0.293969f, 72 0.273300f,-0.166664f,-0.338330f, 0.034654f, 0.351851f, 0.102631f,-0.311806f,-0.224292f, 73 0.224292f, 0.311806f,-0.102631f,-0.351851f,-0.034654f, 0.338330f, 0.166664f,-0.273300f, 74 }; 75 76 static const float analysis_window[240] = { 77 0.000043f, 0.000171f, 0.000385f, 0.000685f, 0.001071f, 0.001541f, 0.002098f, 0.002739f, 78 0.003466f, 0.004278f, 0.005174f, 0.006156f, 0.007222f, 0.008373f, 0.009607f, 0.010926f, 79 0.012329f, 0.013815f, 0.015385f, 0.017037f, 0.018772f, 0.020590f, 0.022490f, 0.024472f, 80 0.026535f, 0.028679f, 0.030904f, 0.033210f, 0.035595f, 0.038060f, 0.040604f, 0.043227f, 81 0.045928f, 0.048707f, 0.051564f, 0.054497f, 0.057506f, 0.060591f, 0.063752f, 0.066987f, 82 0.070297f, 0.073680f, 0.077136f, 0.080665f, 0.084265f, 0.087937f, 0.091679f, 0.095492f, 83 0.099373f, 0.103323f, 0.107342f, 0.111427f, 0.115579f, 0.119797f, 0.124080f, 0.128428f, 84 0.132839f, 0.137313f, 0.141849f, 0.146447f, 0.151105f, 0.155823f, 0.160600f, 0.165435f, 85 0.170327f, 0.175276f, 0.180280f, 0.185340f, 0.190453f, 0.195619f, 0.200838f, 0.206107f, 86 0.211427f, 0.216797f, 0.222215f, 0.227680f, 0.233193f, 0.238751f, 0.244353f, 0.250000f, 87 0.255689f, 0.261421f, 0.267193f, 0.273005f, 0.278856f, 0.284744f, 0.290670f, 0.296632f, 88 0.302628f, 0.308658f, 0.314721f, 0.320816f, 0.326941f, 0.333097f, 0.339280f, 0.345492f, 89 0.351729f, 0.357992f, 0.364280f, 0.370590f, 0.376923f, 0.383277f, 0.389651f, 0.396044f, 90 0.402455f, 0.408882f, 0.415325f, 0.421783f, 0.428254f, 0.434737f, 0.441231f, 0.447736f, 91 0.454249f, 0.460770f, 0.467298f, 0.473832f, 0.480370f, 0.486912f, 0.493455f, 0.500000f, 92 0.506545f, 0.513088f, 0.519630f, 0.526168f, 0.532702f, 0.539230f, 0.545751f, 0.552264f, 93 0.558769f, 0.565263f, 0.571746f, 0.578217f, 0.584675f, 0.591118f, 0.597545f, 0.603956f, 94 0.610349f, 0.616723f, 0.623077f, 0.629410f, 0.635720f, 0.642008f, 0.648271f, 0.654508f, 95 0.660720f, 0.666903f, 0.673059f, 0.679184f, 0.685279f, 0.691342f, 0.697372f, 0.703368f, 96 0.709330f, 0.715256f, 0.721144f, 0.726995f, 0.732807f, 0.738579f, 0.744311f, 0.750000f, 97 0.755647f, 0.761249f, 0.766807f, 0.772320f, 0.777785f, 0.783203f, 0.788573f, 0.793893f, 98 0.799162f, 0.804381f, 0.809547f, 0.814660f, 0.819720f, 0.824724f, 0.829673f, 0.834565f, 99 0.839400f, 0.844177f, 0.848895f, 0.853553f, 0.858151f, 0.862687f, 0.867161f, 0.871572f, 100 0.875920f, 0.880203f, 0.884421f, 0.888573f, 0.892658f, 0.896677f, 0.900627f, 0.904508f, 101 0.908321f, 0.912063f, 0.915735f, 0.919335f, 0.922864f, 0.926320f, 0.929703f, 0.933013f, 102 0.936248f, 0.939409f, 0.942494f, 0.945503f, 0.948436f, 0.951293f, 0.954072f, 0.956773f, 103 0.959396f, 0.961940f, 0.964405f, 0.966790f, 0.969096f, 0.971321f, 0.973465f, 0.975528f, 104 0.977510f, 0.979410f, 0.981228f, 0.982963f, 0.984615f, 0.986185f, 0.987671f, 0.989074f, 105 0.990393f, 0.991627f, 0.992778f, 0.993844f, 0.994826f, 0.995722f, 0.996534f, 0.997261f, 106 0.997902f, 0.998459f, 0.998929f, 0.999315f, 0.999615f, 0.999829f, 0.999957f, 1.000000f, 107 }; 108 109 static const int tbands[NB_TBANDS+1] = { 110 4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240 111 }; 112 113 #define NB_TONAL_SKIP_BANDS 9 114 115 static opus_val32 silk_resampler_down2_hp( 116 opus_val32 *S, /* I/O State vector [ 2 ] */ 117 opus_val32 *out, /* O Output signal [ floor(len/2) ] */ 118 const opus_val32 *in, /* I Input signal [ len ] */ 119 int inLen /* I Number of input samples */ 120 ) 121 { 122 int k, len2 = inLen/2; 123 opus_val32 in32, out32, out32_hp, Y, X; 124 opus_val64 hp_ener = 0; 125 /* Internal variables and state are in Q10 format */ 126 for( k = 0; k < len2; k++ ) { 127 /* Convert to Q10 */ 128 in32 = in[ 2 * k ]; 129 130 /* All-pass section for even input sample */ 131 Y = SUB32( in32, S[ 0 ] ); 132 X = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y); 133 out32 = ADD32( S[ 0 ], X ); 134 S[ 0 ] = ADD32( in32, X ); 135 out32_hp = out32; 136 /* Convert to Q10 */ 137 in32 = in[ 2 * k + 1 ]; 138 139 /* All-pass section for odd input sample, and add to output of previous section */ 140 Y = SUB32( in32, S[ 1 ] ); 141 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); 142 out32 = ADD32( out32, S[ 1 ] ); 143 out32 = ADD32( out32, X ); 144 S[ 1 ] = ADD32( in32, X ); 145 146 Y = SUB32( -in32, S[ 2 ] ); 147 X = MULT16_32_Q15(QCONST16(0.15063f, 15), Y); 148 out32_hp = ADD32( out32_hp, S[ 2 ] ); 149 out32_hp = ADD32( out32_hp, X ); 150 S[ 2 ] = ADD32( -in32, X ); 151 152 /* len2 can be up to 480, so we shift by 8 to make it fit. */ 153 hp_ener += SHR64(out32_hp*(opus_val64)out32_hp, 8); 154 /* Add, convert back to int16 and store to output */ 155 out[ k ] = HALF32(out32); 156 } 157 #ifdef FIXED_POINT 158 /* Fitting in 32 bits. */ 159 hp_ener = hp_ener >> (2*SIG_SHIFT); 160 if (hp_ener > 2147483647) hp_ener = 2147483647; 161 #endif 162 return (opus_val32)hp_ener; 163 } 164 165 static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs) 166 { 167 VARDECL(opus_val32, tmp); 168 int j; 169 opus_val32 ret = 0; 170 SAVE_STACK; 171 172 if (subframe==0) return 0; 173 if (Fs == 48000) 174 { 175 subframe *= 2; 176 offset *= 2; 177 } else if (Fs == 16000) { 178 subframe = subframe*2/3; 179 offset = offset*2/3; 180 } 181 else if (Fs != 24000) celt_assert(0); 182 ALLOC(tmp, subframe, opus_val32); 183 184 downmix(_x, tmp, subframe, offset, c1, c2, C); 185 if ((c2==-2 && C==2) || c2>-1) { 186 for (j=0;j<subframe;j++) { 187 tmp[j] = HALF32(tmp[j]); 188 } 189 } 190 if (Fs == 48000) 191 { 192 ret = silk_resampler_down2_hp(S, y, tmp, subframe); 193 } else if (Fs == 24000) { 194 OPUS_COPY(y, tmp, subframe); 195 } else if (Fs == 16000) { 196 VARDECL(opus_val32, tmp3x); 197 ALLOC(tmp3x, 3*subframe, opus_val32); 198 /* Don't do this at home! This resampler is horrible and it's only (barely) 199 usable for the purpose of the analysis because we don't care about all 200 the aliasing between 8 kHz and 12 kHz. */ 201 for (j=0;j<subframe;j++) 202 { 203 tmp3x[3*j] = tmp[j]; 204 tmp3x[3*j+1] = tmp[j]; 205 tmp3x[3*j+2] = tmp[j]; 206 } 207 silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); 208 } 209 RESTORE_STACK; 210 #ifndef FIXED_POINT 211 ret *= 1.f/32768/32768; 212 #endif 213 return ret; 214 } 215 216 void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs) 217 { 218 /* Initialize reusable fields. */ 219 tonal->arch = opus_select_arch(); 220 tonal->Fs = Fs; 221 /* Clear remaining fields. */ 222 tonality_analysis_reset(tonal); 223 } 224 225 void tonality_analysis_reset(TonalityAnalysisState *tonal) 226 { 227 /* Clear non-reusable fields. */ 228 char *start = (char*)&tonal->TONALITY_ANALYSIS_RESET_START; 229 OPUS_CLEAR(start, sizeof(TonalityAnalysisState) - (start - (char*)tonal)); 230 } 231 232 void tonality_get_info(TonalityAnalysisState *tonal, AnalysisInfo *info_out, int len) 233 { 234 int pos; 235 int curr_lookahead; 236 float tonality_max; 237 float tonality_avg; 238 int tonality_count; 239 int i; 240 int pos0; 241 float prob_avg; 242 float prob_count; 243 float prob_min, prob_max; 244 float vad_prob; 245 int mpos, vpos; 246 int bandwidth_span; 247 248 pos = tonal->read_pos; 249 curr_lookahead = tonal->write_pos-tonal->read_pos; 250 if (curr_lookahead<0) 251 curr_lookahead += DETECT_SIZE; 252 253 tonal->read_subframe += len/(tonal->Fs/400); 254 while (tonal->read_subframe>=8) 255 { 256 tonal->read_subframe -= 8; 257 tonal->read_pos++; 258 } 259 if (tonal->read_pos>=DETECT_SIZE) 260 tonal->read_pos-=DETECT_SIZE; 261 262 /* On long frames, look at the second analysis window rather than the first. */ 263 if (len > tonal->Fs/50 && pos != tonal->write_pos) 264 { 265 pos++; 266 if (pos==DETECT_SIZE) 267 pos=0; 268 } 269 if (pos == tonal->write_pos) 270 pos--; 271 if (pos<0) 272 pos = DETECT_SIZE-1; 273 pos0 = pos; 274 OPUS_COPY(info_out, &tonal->info[pos], 1); 275 if (!info_out->valid) 276 return; 277 tonality_max = tonality_avg = info_out->tonality; 278 tonality_count = 1; 279 /* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */ 280 bandwidth_span = 6; 281 /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */ 282 for (i=0;i<3;i++) 283 { 284 pos++; 285 if (pos==DETECT_SIZE) 286 pos = 0; 287 if (pos == tonal->write_pos) 288 break; 289 tonality_max = MAX32(tonality_max, tonal->info[pos].tonality); 290 tonality_avg += tonal->info[pos].tonality; 291 tonality_count++; 292 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth); 293 bandwidth_span--; 294 } 295 pos = pos0; 296 /* Look back in time to see if any has a wider bandwidth than the current frame. */ 297 for (i=0;i<bandwidth_span;i++) 298 { 299 pos--; 300 if (pos < 0) 301 pos = DETECT_SIZE-1; 302 if (pos == tonal->write_pos) 303 break; 304 info_out->bandwidth = IMAX(info_out->bandwidth, tonal->info[pos].bandwidth); 305 } 306 info_out->tonality = MAX32(tonality_avg/tonality_count, tonality_max-.2f); 307 308 mpos = vpos = pos0; 309 /* If we have enough look-ahead, compensate for the ~5-frame delay in the music prob and 310 ~1 frame delay in the VAD prob. */ 311 if (curr_lookahead > 15) 312 { 313 mpos += 5; 314 if (mpos>=DETECT_SIZE) 315 mpos -= DETECT_SIZE; 316 vpos += 1; 317 if (vpos>=DETECT_SIZE) 318 vpos -= DETECT_SIZE; 319 } 320 321 /* The following calculations attempt to minimize a "badness function" 322 for the transition. When switching from speech to music, the badness 323 of switching at frame k is 324 b_k = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T) 325 where 326 v_i is the activity probability (VAD) at frame i, 327 p_i is the music probability at frame i 328 T is the probability threshold for switching 329 S is the penalty for switching during active audio rather than silence 330 the current frame has index i=0 331 332 Rather than apply badness to directly decide when to switch, what we compute 333 instead is the threshold for which the optimal switching point is now. When 334 considering whether to switch now (frame 0) or at frame k, we have: 335 S*v_0 = S*v_k + \sum_{i=0}^{k-1} v_i*(p_i - T) 336 which gives us: 337 T = ( \sum_{i=0}^{k-1} v_i*p_i + S*(v_k-v_0) ) / ( \sum_{i=0}^{k-1} v_i ) 338 We take the min threshold across all positive values of k (up to the maximum 339 amount of lookahead we have) to give us the threshold for which the current 340 frame is the optimal switch point. 341 342 The last step is that we need to consider whether we want to switch at all. 343 For that we use the average of the music probability over the entire window. 344 If the threshold is higher than that average we're not going to 345 switch, so we compute a min with the average as well. The result of all these 346 min operations is music_prob_min, which gives the threshold for switching to music 347 if we're currently encoding for speech. 348 349 We do the exact opposite to compute music_prob_max which is used for switching 350 from music to speech. 351 */ 352 prob_min = 1.f; 353 prob_max = 0.f; 354 vad_prob = tonal->info[vpos].activity_probability; 355 prob_count = MAX16(.1f, vad_prob); 356 prob_avg = MAX16(.1f, vad_prob)*tonal->info[mpos].music_prob; 357 while (1) 358 { 359 float pos_vad; 360 mpos++; 361 if (mpos==DETECT_SIZE) 362 mpos = 0; 363 if (mpos == tonal->write_pos) 364 break; 365 vpos++; 366 if (vpos==DETECT_SIZE) 367 vpos = 0; 368 if (vpos == tonal->write_pos) 369 break; 370 pos_vad = tonal->info[vpos].activity_probability; 371 prob_min = MIN16((prob_avg - TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_min); 372 prob_max = MAX16((prob_avg + TRANSITION_PENALTY*(vad_prob - pos_vad))/prob_count, prob_max); 373 prob_count += MAX16(.1f, pos_vad); 374 prob_avg += MAX16(.1f, pos_vad)*tonal->info[mpos].music_prob; 375 } 376 info_out->music_prob = prob_avg/prob_count; 377 prob_min = MIN16(prob_avg/prob_count, prob_min); 378 prob_max = MAX16(prob_avg/prob_count, prob_max); 379 prob_min = MAX16(prob_min, 0.f); 380 prob_max = MIN16(prob_max, 1.f); 381 382 /* If we don't have enough look-ahead, do our best to make a decent decision. */ 383 if (curr_lookahead < 10) 384 { 385 float pmin, pmax; 386 pmin = prob_min; 387 pmax = prob_max; 388 pos = pos0; 389 /* Look for min/max in the past. */ 390 for (i=0;i<IMIN(tonal->count-1, 15);i++) 391 { 392 pos--; 393 if (pos < 0) 394 pos = DETECT_SIZE-1; 395 pmin = MIN16(pmin, tonal->info[pos].music_prob); 396 pmax = MAX16(pmax, tonal->info[pos].music_prob); 397 } 398 /* Bias against switching on active audio. */ 399 pmin = MAX16(0.f, pmin - .1f*vad_prob); 400 pmax = MIN16(1.f, pmax + .1f*vad_prob); 401 prob_min += (1.f-.1f*curr_lookahead)*(pmin - prob_min); 402 prob_max += (1.f-.1f*curr_lookahead)*(pmax - prob_max); 403 } 404 info_out->music_prob_min = prob_min; 405 info_out->music_prob_max = prob_max; 406 407 /* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */ 408 } 409 410 static const float std_feature_bias[9] = { 411 5.684947f, 3.475288f, 1.770634f, 1.599784f, 3.773215f, 412 2.163313f, 1.260756f, 1.116868f, 1.918795f 413 }; 414 415 #define LEAKAGE_OFFSET 2.5f 416 #define LEAKAGE_SLOPE 2.f 417 418 #ifdef FIXED_POINT 419 /* For fixed-point, the input is +/-2^15 shifted up by SIG_SHIFT, so we need to 420 compensate for that in the energy. */ 421 #define SCALE_COMPENS (1.f/((opus_int32)1<<(15+SIG_SHIFT))) 422 #define SCALE_ENER(e) ((SCALE_COMPENS*SCALE_COMPENS)*(e)) 423 #else 424 #define SCALE_ENER(e) ((1.f/32768/32768)*e) 425 #endif 426 427 #ifdef FIXED_POINT 428 static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth) 429 { 430 int silence = 0; 431 opus_val32 sample_max = 0; 432 #ifdef MLP_TRAINING 433 return 0; 434 #endif 435 sample_max = celt_maxabs32(pcm, frame_size*channels); 436 437 silence = (sample_max == 0); 438 (void)lsb_depth; 439 return silence; 440 } 441 #else 442 #define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth) 443 #endif 444 445 static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix) 446 { 447 int i, b; 448 const kiss_fft_state *kfft; 449 VARDECL(kiss_fft_cpx, in); 450 VARDECL(kiss_fft_cpx, out); 451 int N = 480, N2=240; 452 float * OPUS_RESTRICT A = tonal->angle; 453 float * OPUS_RESTRICT dA = tonal->d_angle; 454 float * OPUS_RESTRICT d2A = tonal->d2_angle; 455 VARDECL(float, tonality); 456 VARDECL(float, noisiness); 457 float band_tonality[NB_TBANDS]; 458 float logE[NB_TBANDS]; 459 float BFCC[8]; 460 float features[25]; 461 float frame_tonality; 462 float max_frame_tonality; 463 /*float tw_sum=0;*/ 464 float frame_noisiness; 465 const float pi4 = (float)(M_PI*M_PI*M_PI*M_PI); 466 float slope=0; 467 float frame_stationarity; 468 float relativeE; 469 float frame_probs[2]; 470 float alpha, alphaE, alphaE2; 471 float frame_loudness; 472 float bandwidth_mask; 473 int is_masked[NB_TBANDS+1]; 474 int bandwidth=0; 475 float maxE = 0; 476 float noise_floor; 477 int remaining; 478 AnalysisInfo *info; 479 float hp_ener; 480 float tonality2[240]; 481 float midE[8]; 482 float spec_variability=0; 483 float band_log2[NB_TBANDS+1]; 484 float leakage_from[NB_TBANDS+1]; 485 float leakage_to[NB_TBANDS+1]; 486 float layer_out[MAX_NEURONS]; 487 float below_max_pitch; 488 float above_max_pitch; 489 int is_silence; 490 SAVE_STACK; 491 492 if (!tonal->initialized) 493 { 494 tonal->mem_fill = 240; 495 tonal->initialized = 1; 496 } 497 alpha = 1.f/IMIN(10, 1+tonal->count); 498 alphaE = 1.f/IMIN(25, 1+tonal->count); 499 /* Noise floor related decay for bandwidth detection: -2.2 dB/second */ 500 alphaE2 = 1.f/IMIN(100, 1+tonal->count); 501 if (tonal->count <= 1) alphaE2 = 1; 502 503 if (tonal->Fs == 48000) 504 { 505 /* len and offset are now at 24 kHz. */ 506 len/= 2; 507 offset /= 2; 508 } else if (tonal->Fs == 16000) { 509 len = 3*len/2; 510 offset = 3*offset/2; 511 } 512 513 kfft = celt_mode->mdct.kfft[0]; 514 tonal->hp_ener_accum += (float)downmix_and_resample(downmix, x, 515 &tonal->inmem[tonal->mem_fill], tonal->downmix_state, 516 IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs); 517 if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE) 518 { 519 tonal->mem_fill += len; 520 /* Don't have enough to update the analysis */ 521 RESTORE_STACK; 522 return; 523 } 524 hp_ener = tonal->hp_ener_accum; 525 info = &tonal->info[tonal->write_pos++]; 526 if (tonal->write_pos>=DETECT_SIZE) 527 tonal->write_pos-=DETECT_SIZE; 528 529 is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth); 530 531 ALLOC(in, 480, kiss_fft_cpx); 532 ALLOC(out, 480, kiss_fft_cpx); 533 ALLOC(tonality, 240, float); 534 ALLOC(noisiness, 240, float); 535 for (i=0;i<N2;i++) 536 { 537 float w = analysis_window[i]; 538 in[i].r = (kiss_fft_scalar)(w*tonal->inmem[i]); 539 in[i].i = (kiss_fft_scalar)(w*tonal->inmem[N2+i]); 540 in[N-i-1].r = (kiss_fft_scalar)(w*tonal->inmem[N-i-1]); 541 in[N-i-1].i = (kiss_fft_scalar)(w*tonal->inmem[N+N2-i-1]); 542 } 543 OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240); 544 remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill); 545 tonal->hp_ener_accum = (float)downmix_and_resample(downmix, x, 546 &tonal->inmem[240], tonal->downmix_state, remaining, 547 offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs); 548 tonal->mem_fill = 240 + remaining; 549 if (is_silence) 550 { 551 /* On silence, copy the previous analysis. */ 552 int prev_pos = tonal->write_pos-2; 553 if (prev_pos < 0) 554 prev_pos += DETECT_SIZE; 555 OPUS_COPY(info, &tonal->info[prev_pos], 1); 556 RESTORE_STACK; 557 return; 558 } 559 opus_fft(kfft, in, out, tonal->arch); 560 #ifndef FIXED_POINT 561 /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */ 562 if (celt_isnan(out[0].r)) 563 { 564 info->valid = 0; 565 RESTORE_STACK; 566 return; 567 } 568 #endif 569 570 for (i=1;i<N2;i++) 571 { 572 float X1r, X2r, X1i, X2i; 573 float angle, d_angle, d2_angle; 574 float angle2, d_angle2, d2_angle2; 575 float mod1, mod2, avg_mod; 576 X1r = (float)out[i].r+out[N-i].r; 577 X1i = (float)out[i].i-out[N-i].i; 578 X2r = (float)out[i].i+out[N-i].i; 579 X2i = (float)out[N-i].r-out[i].r; 580 581 angle = (float)(.5f/M_PI)*fast_atan2f(X1i, X1r); 582 d_angle = angle - A[i]; 583 d2_angle = d_angle - dA[i]; 584 585 angle2 = (float)(.5f/M_PI)*fast_atan2f(X2i, X2r); 586 d_angle2 = angle2 - angle; 587 d2_angle2 = d_angle2 - d_angle; 588 589 mod1 = d2_angle - (float)float2int(d2_angle); 590 noisiness[i] = ABS16(mod1); 591 mod1 *= mod1; 592 mod1 *= mod1; 593 594 mod2 = d2_angle2 - (float)float2int(d2_angle2); 595 noisiness[i] += ABS16(mod2); 596 mod2 *= mod2; 597 mod2 *= mod2; 598 599 avg_mod = .25f*(d2A[i]+mod1+2*mod2); 600 /* This introduces an extra delay of 2 frames in the detection. */ 601 tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f; 602 /* No delay on this detection, but it's less reliable. */ 603 tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f; 604 605 A[i] = angle2; 606 dA[i] = d_angle2; 607 d2A[i] = mod2; 608 } 609 for (i=2;i<N2-1;i++) 610 { 611 float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1])); 612 tonality[i] = .9f*MAX32(tonality[i], tt-.1f); 613 } 614 frame_tonality = 0; 615 max_frame_tonality = 0; 616 /*tw_sum = 0;*/ 617 info->activity = 0; 618 frame_noisiness = 0; 619 frame_stationarity = 0; 620 if (!tonal->count) 621 { 622 for (b=0;b<NB_TBANDS;b++) 623 { 624 tonal->lowE[b] = 1e10; 625 tonal->highE[b] = -1e10; 626 } 627 } 628 relativeE = 0; 629 frame_loudness = 0; 630 /* The energy of the very first band is special because of DC. */ 631 { 632 float E = 0; 633 float X1r, X2r; 634 X1r = 2*(float)out[0].r; 635 X2r = 2*(float)out[0].i; 636 E = X1r*X1r + X2r*X2r; 637 for (i=1;i<4;i++) 638 { 639 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 640 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 641 E += binE; 642 } 643 E = SCALE_ENER(E); 644 band_log2[0] = .5f*1.442695f*(float)log(E+1e-10f); 645 } 646 for (b=0;b<NB_TBANDS;b++) 647 { 648 float E=0, tE=0, nE=0; 649 float L1, L2; 650 float stationarity; 651 for (i=tbands[b];i<tbands[b+1];i++) 652 { 653 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 654 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 655 binE = SCALE_ENER(binE); 656 E += binE; 657 tE += binE*MAX32(0, tonality[i]); 658 nE += binE*2.f*(.5f-noisiness[i]); 659 } 660 #ifndef FIXED_POINT 661 /* Check for extreme band energies that could cause NaNs later. */ 662 if (!(E<1e9f) || celt_isnan(E)) 663 { 664 info->valid = 0; 665 RESTORE_STACK; 666 return; 667 } 668 #endif 669 670 tonal->E[tonal->E_count][b] = E; 671 frame_noisiness += nE/(1e-15f+E); 672 673 frame_loudness += (float)sqrt(E+1e-10f); 674 logE[b] = (float)log(E+1e-10f); 675 band_log2[b+1] = .5f*1.442695f*(float)log(E+1e-10f); 676 tonal->logE[tonal->E_count][b] = logE[b]; 677 if (tonal->count==0) 678 tonal->highE[b] = tonal->lowE[b] = logE[b]; 679 if (tonal->highE[b] > tonal->lowE[b] + 7.5) 680 { 681 if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b]) 682 tonal->highE[b] -= .01f; 683 else 684 tonal->lowE[b] += .01f; 685 } 686 if (logE[b] > tonal->highE[b]) 687 { 688 tonal->highE[b] = logE[b]; 689 tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]); 690 } else if (logE[b] < tonal->lowE[b]) 691 { 692 tonal->lowE[b] = logE[b]; 693 tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]); 694 } 695 relativeE += (logE[b]-tonal->lowE[b])/(1e-5f + (tonal->highE[b]-tonal->lowE[b])); 696 697 L1=L2=0; 698 for (i=0;i<NB_FRAMES;i++) 699 { 700 L1 += (float)sqrt(tonal->E[i][b]); 701 L2 += tonal->E[i][b]; 702 } 703 704 stationarity = MIN16(0.99f,L1/(float)sqrt(1e-15+NB_FRAMES*L2)); 705 stationarity *= stationarity; 706 stationarity *= stationarity; 707 frame_stationarity += stationarity; 708 /*band_tonality[b] = tE/(1e-15+E)*/; 709 band_tonality[b] = MAX16(tE/(1e-15f+E), stationarity*tonal->prev_band_tonality[b]); 710 #if 0 711 if (b>=NB_TONAL_SKIP_BANDS) 712 { 713 frame_tonality += tweight[b]*band_tonality[b]; 714 tw_sum += tweight[b]; 715 } 716 #else 717 frame_tonality += band_tonality[b]; 718 if (b>=NB_TBANDS-NB_TONAL_SKIP_BANDS) 719 frame_tonality -= band_tonality[b-NB_TBANDS+NB_TONAL_SKIP_BANDS]; 720 #endif 721 max_frame_tonality = MAX16(max_frame_tonality, (1.f+.03f*(b-NB_TBANDS))*frame_tonality); 722 slope += band_tonality[b]*(b-8); 723 /*printf("%f %f ", band_tonality[b], stationarity);*/ 724 tonal->prev_band_tonality[b] = band_tonality[b]; 725 } 726 727 leakage_from[0] = band_log2[0]; 728 leakage_to[0] = band_log2[0] - LEAKAGE_OFFSET; 729 for (b=1;b<NB_TBANDS+1;b++) 730 { 731 float leak_slope = LEAKAGE_SLOPE*(tbands[b]-tbands[b-1])/4; 732 leakage_from[b] = MIN16(leakage_from[b-1]+leak_slope, band_log2[b]); 733 leakage_to[b] = MAX16(leakage_to[b-1]-leak_slope, band_log2[b]-LEAKAGE_OFFSET); 734 } 735 for (b=NB_TBANDS-2;b>=0;b--) 736 { 737 float leak_slope = LEAKAGE_SLOPE*(tbands[b+1]-tbands[b])/4; 738 leakage_from[b] = MIN16(leakage_from[b+1]+leak_slope, leakage_from[b]); 739 leakage_to[b] = MAX16(leakage_to[b+1]-leak_slope, leakage_to[b]); 740 } 741 celt_assert(NB_TBANDS+1 <= LEAK_BANDS); 742 for (b=0;b<NB_TBANDS+1;b++) 743 { 744 /* leak_boost[] is made up of two terms. The first, based on leakage_to[], 745 represents the boost needed to overcome the amount of analysis leakage 746 cause in a weaker band b by louder neighbouring bands. 747 The second, based on leakage_from[], applies to a loud band b for 748 which the quantization noise causes synthesis leakage to the weaker 749 neighbouring bands. */ 750 float boost = MAX16(0, leakage_to[b] - band_log2[b]) + 751 MAX16(0, band_log2[b] - (leakage_from[b]+LEAKAGE_OFFSET)); 752 info->leak_boost[b] = IMIN(255, (int)floor(.5 + 64.f*boost)); 753 } 754 for (;b<LEAK_BANDS;b++) info->leak_boost[b] = 0; 755 756 for (i=0;i<NB_FRAMES;i++) 757 { 758 int j; 759 float mindist = 1e15f; 760 for (j=0;j<NB_FRAMES;j++) 761 { 762 int k; 763 float dist=0; 764 for (k=0;k<NB_TBANDS;k++) 765 { 766 float tmp; 767 tmp = tonal->logE[i][k] - tonal->logE[j][k]; 768 dist += tmp*tmp; 769 } 770 if (j!=i) 771 mindist = MIN32(mindist, dist); 772 } 773 spec_variability += mindist; 774 } 775 spec_variability = (float)sqrt(spec_variability/NB_FRAMES/NB_TBANDS); 776 bandwidth_mask = 0; 777 bandwidth = 0; 778 maxE = 0; 779 noise_floor = 5.7e-4f/(1<<(IMAX(0,lsb_depth-8))); 780 noise_floor *= noise_floor; 781 below_max_pitch=0; 782 above_max_pitch=0; 783 for (b=0;b<NB_TBANDS;b++) 784 { 785 float E=0; 786 float Em; 787 int band_start, band_end; 788 /* Keep a margin of 300 Hz for aliasing */ 789 band_start = tbands[b]; 790 band_end = tbands[b+1]; 791 for (i=band_start;i<band_end;i++) 792 { 793 float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r 794 + out[i].i*(float)out[i].i + out[N-i].i*(float)out[N-i].i; 795 E += binE; 796 } 797 E = SCALE_ENER(E); 798 maxE = MAX32(maxE, E); 799 if (band_start < 64) 800 { 801 below_max_pitch += E; 802 } else { 803 above_max_pitch += E; 804 } 805 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); 806 Em = MAX32(E, tonal->meanE[b]); 807 /* Consider the band "active" only if all these conditions are met: 808 1) less than 90 dB below the peak band (maximal masking possible considering 809 both the ATH and the loudness-dependent slope of the spreading function) 810 2) above the PCM quantization noise floor 811 We use b+1 because the first CELT band isn't included in tbands[] 812 */ 813 if (E*1e9f > maxE && (Em > 3*noise_floor*(band_end-band_start) || E > noise_floor*(band_end-band_start))) 814 bandwidth = b+1; 815 /* Check if the band is masked (see below). */ 816 is_masked[b] = E < (tonal->prev_bandwidth >= b+1 ? .01f : .05f)*bandwidth_mask; 817 /* Use a simple follower with 13 dB/Bark slope for spreading function. */ 818 bandwidth_mask = MAX32(.05f*bandwidth_mask, E); 819 } 820 /* Special case for the last two bands, for which we don't have spectrum but only 821 the energy above 12 kHz. The difficulty here is that the high-pass we use 822 leaks some LF energy, so we need to increase the threshold without accidentally cutting 823 off the band. */ 824 if (tonal->Fs == 48000) { 825 float noise_ratio; 826 float Em; 827 float E = hp_ener*(1.f/(60*60)); 828 noise_ratio = tonal->prev_bandwidth==20 ? 10.f : 30.f; 829 830 #ifdef FIXED_POINT 831 /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */ 832 E *= 256.f*(1.f/Q15ONE)*(1.f/Q15ONE); 833 #endif 834 above_max_pitch += E; 835 tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E); 836 Em = MAX32(E, tonal->meanE[b]); 837 if (Em > 3*noise_ratio*noise_floor*160 || E > noise_ratio*noise_floor*160) 838 bandwidth = 20; 839 /* Check if the band is masked (see below). */ 840 is_masked[b] = E < (tonal->prev_bandwidth == 20 ? .01f : .05f)*bandwidth_mask; 841 } 842 if (above_max_pitch > below_max_pitch) 843 info->max_pitch_ratio = below_max_pitch/above_max_pitch; 844 else 845 info->max_pitch_ratio = 1; 846 /* In some cases, resampling aliasing can create a small amount of energy in the first band 847 being cut. So if the last band is masked, we don't include it. */ 848 if (bandwidth == 20 && is_masked[NB_TBANDS]) 849 bandwidth-=2; 850 else if (bandwidth > 0 && bandwidth <= NB_TBANDS && is_masked[bandwidth-1]) 851 bandwidth--; 852 if (tonal->count<=2) 853 bandwidth = 20; 854 frame_loudness = 20*(float)log10(frame_loudness); 855 tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness); 856 tonal->lowECount *= (1-alphaE); 857 if (frame_loudness < tonal->Etracker-30) 858 tonal->lowECount += alphaE; 859 860 for (i=0;i<8;i++) 861 { 862 float sum=0; 863 for (b=0;b<16;b++) 864 sum += dct_table[i*16+b]*logE[b]; 865 BFCC[i] = sum; 866 } 867 for (i=0;i<8;i++) 868 { 869 float sum=0; 870 for (b=0;b<16;b++) 871 sum += dct_table[i*16+b]*.5f*(tonal->highE[b]+tonal->lowE[b]); 872 midE[i] = sum; 873 } 874 875 frame_stationarity /= NB_TBANDS; 876 relativeE /= NB_TBANDS; 877 if (tonal->count<10) 878 relativeE = .5f; 879 frame_noisiness /= NB_TBANDS; 880 #if 1 881 info->activity = frame_noisiness + (1-frame_noisiness)*relativeE; 882 #else 883 info->activity = .5*(1+frame_noisiness-frame_stationarity); 884 #endif 885 frame_tonality = (max_frame_tonality/(NB_TBANDS-NB_TONAL_SKIP_BANDS)); 886 frame_tonality = MAX16(frame_tonality, tonal->prev_tonality*.8f); 887 tonal->prev_tonality = frame_tonality; 888 889 slope /= 8*8; 890 info->tonality_slope = slope; 891 892 tonal->E_count = (tonal->E_count+1)%NB_FRAMES; 893 tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX); 894 info->tonality = frame_tonality; 895 896 for (i=0;i<4;i++) 897 features[i] = -0.12299f*(BFCC[i]+tonal->mem[i+24]) + 0.49195f*(tonal->mem[i]+tonal->mem[i+16]) + 0.69693f*tonal->mem[i+8] - 1.4349f*tonal->cmean[i]; 898 899 for (i=0;i<4;i++) 900 tonal->cmean[i] = (1-alpha)*tonal->cmean[i] + alpha*BFCC[i]; 901 902 for (i=0;i<4;i++) 903 features[4+i] = 0.63246f*(BFCC[i]-tonal->mem[i+24]) + 0.31623f*(tonal->mem[i]-tonal->mem[i+16]); 904 for (i=0;i<3;i++) 905 features[8+i] = 0.53452f*(BFCC[i]+tonal->mem[i+24]) - 0.26726f*(tonal->mem[i]+tonal->mem[i+16]) -0.53452f*tonal->mem[i+8]; 906 907 if (tonal->count > 5) 908 { 909 for (i=0;i<9;i++) 910 tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i]; 911 } 912 for (i=0;i<4;i++) 913 features[i] = BFCC[i]-midE[i]; 914 915 for (i=0;i<8;i++) 916 { 917 tonal->mem[i+24] = tonal->mem[i+16]; 918 tonal->mem[i+16] = tonal->mem[i+8]; 919 tonal->mem[i+8] = tonal->mem[i]; 920 tonal->mem[i] = BFCC[i]; 921 } 922 for (i=0;i<9;i++) 923 features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i]; 924 features[18] = spec_variability - 0.78f; 925 features[20] = info->tonality - 0.154723f; 926 features[21] = info->activity - 0.724643f; 927 features[22] = frame_stationarity - 0.743717f; 928 features[23] = info->tonality_slope + 0.069216f; 929 features[24] = tonal->lowECount - 0.067930f; 930 931 analysis_compute_dense(&layer0, layer_out, features); 932 analysis_compute_gru(&layer1, tonal->rnn_state, layer_out); 933 analysis_compute_dense(&layer2, frame_probs, tonal->rnn_state); 934 935 /* Probability of speech or music vs noise */ 936 info->activity_probability = frame_probs[1]; 937 info->music_prob = frame_probs[0]; 938 939 /*printf("%f %f %f\n", frame_probs[0], frame_probs[1], info->music_prob);*/ 940 #ifdef MLP_TRAINING 941 for (i=0;i<25;i++) 942 printf("%f ", features[i]); 943 printf("\n"); 944 #endif 945 946 info->bandwidth = bandwidth; 947 tonal->prev_bandwidth = bandwidth; 948 /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/ 949 info->noisiness = frame_noisiness; 950 info->valid = 1; 951 RESTORE_STACK; 952 } 953 954 void run_analysis(TonalityAnalysisState *analysis, const CELTMode *celt_mode, const void *analysis_pcm, 955 int analysis_frame_size, int frame_size, int c1, int c2, int C, opus_int32 Fs, 956 int lsb_depth, downmix_func downmix, AnalysisInfo *analysis_info) 957 { 958 int offset; 959 int pcm_len; 960 961 analysis_frame_size -= analysis_frame_size&1; 962 if (analysis_pcm != NULL) 963 { 964 /* Avoid overflow/wrap-around of the analysis buffer */ 965 analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size); 966 967 pcm_len = analysis_frame_size - analysis->analysis_offset; 968 offset = analysis->analysis_offset; 969 while (pcm_len>0) { 970 tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix); 971 offset += Fs/50; 972 pcm_len -= Fs/50; 973 } 974 analysis->analysis_offset = analysis_frame_size; 975 976 analysis->analysis_offset -= frame_size; 977 } 978 979 tonality_get_info(analysis, analysis_info, frame_size); 980 } 981 982 #endif /* DISABLE_FLOAT_API */