vad_core.c (26200B)
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/vad/vad_core.h"

#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "common_audio/vad/vad_filterbank.h"
#include "common_audio/vad/vad_gmm.h"
#include "common_audio/vad/vad_sp.h"
#include "rtc_base/sanitizer.h"

// Constant tables below use fixed-point "Qn" notation: a value in Qn
// represents (value / 2^n).
//
// Spectrum weighting, one weight per frequency sub-band (`kNumChannels`),
// applied to per-channel log-likelihood ratios in the global VAD decision.
static const int16_t kSpectrumWeight[kNumChannels] = {6, 8, 10, 12, 14, 16};
// Smoothing constants for updating the noise/speech model means.
static const int16_t kNoiseUpdateConst = 655;    // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
// Long-term correction factor for the noise mean.
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {544, 544, 576,
                                                         576, 576, 576};
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {11392, 11392, 11520,
                                                     11520, 11520, 11520};
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = {640, 768};
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {9216, 9088, 8960,
                                                    8832, 8704, 8576};
// Start values for the Gaussian models, Q7.
// Each table holds `kTableSize` = `kNumGaussians` * `kNumChannels` entries,
// laid out as [gaussian0: ch0..ch5, gaussian1: ch0..ch5].
//
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {34, 62, 72, 66, 53, 25,
                                                      94, 66, 56, 62, 75, 103};
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {48, 82, 45, 87, 50, 47,
                                                       80, 46, 83, 41, 78, 81};
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362};
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {8306, 10085, 10078, 11823,
                                                     11843, 6309, 9473, 9571,
                                                     10879, 7581, 8180, 7483};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455};
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850};

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms), i.e. each
// array is indexed {0, 1, 2} by frame length. `kOverHangMax*` are hangover
// lengths (number of frames still reported as speech after speech ends);
// `kLocalThreshold*`/`kGlobalThreshold*` are per-channel and summed
// log-likelihood-ratio decision thresholds.
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = {8, 4, 3};
static const int16_t kOverHangMax2Q[3] = {14, 7, 5};
static const int16_t kLocalThresholdQ[3] = {24, 21, 24};
static const int16_t kGlobalThresholdQ[3] = {57, 48, 57};
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = {8, 4, 3};
static const int16_t kOverHangMax2LBR[3] = {14, 7, 5};
static const int16_t kLocalThresholdLBR[3] = {37, 32, 37};
static const int16_t kGlobalThresholdLBR[3] = {100, 80, 100};
// Mode 2, Aggressive.
83 static const int16_t kOverHangMax1AGG[3] = {6, 3, 2}; 84 static const int16_t kOverHangMax2AGG[3] = {9, 5, 3}; 85 static const int16_t kLocalThresholdAGG[3] = {82, 78, 82}; 86 static const int16_t kGlobalThresholdAGG[3] = {285, 260, 285}; 87 // Mode 3, Very aggressive. 88 static const int16_t kOverHangMax1VAG[3] = {6, 3, 2}; 89 static const int16_t kOverHangMax2VAG[3] = {9, 5, 3}; 90 static const int16_t kLocalThresholdVAG[3] = {94, 94, 94}; 91 static const int16_t kGlobalThresholdVAG[3] = {1100, 1050, 1100}; 92 93 // Calculates the weighted average w.r.t. number of Gaussians. The `data` are 94 // updated with an `offset` before averaging. 95 // 96 // - data [i/o] : Data to average. 97 // - offset [i] : An offset added to `data`. 98 // - weights [i] : Weights used for averaging. 99 // 100 // returns : The weighted average. 101 static int32_t WeightedAverage(int16_t* data, 102 int16_t offset, 103 const int16_t* weights) { 104 int k; 105 int32_t weighted_average = 0; 106 107 for (k = 0; k < kNumGaussians; k++) { 108 data[k * kNumChannels] += offset; 109 weighted_average += data[k * kNumChannels] * weights[k * kNumChannels]; 110 } 111 return weighted_average; 112 } 113 114 // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still 115 // undefined behavior, so not a good idea; this just makes UBSan ignore the 116 // violation, so that our old code can continue to do what it's always been 117 // doing.) 118 static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow") 119 OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) { 120 return a * b; 121 } 122 123 // Calculates the probabilities for both speech and background noise using 124 // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which 125 // type of signal is most probable. 
//
// - self         [i/o] : Pointer to VAD instance
// - features     [i]   : Feature vector of length `kNumChannels`
//                        = log10(energy in frequency band)
// - total_power  [i]   : Total power in audio frame.
// - frame_length [i]   : Number of input samples
//
// - returns            : the VAD decision (0 - noise, non-zero - speech; the
//                        value may exceed 1 during hangover frames).
static int16_t GmmProbability(VadInstT* self,
                              int16_t* features,
                              int16_t total_power,
                              size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = {0};  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = {0};  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of current frame is large enough for processing. The
    // processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as `channel`.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // `kNumGaussians`, with different means and standard deviations depending
      // on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(
            features[channel], self->noise_means[gaussian],
            self->noise_stds[gaussian], &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(
            features[channel], self->speech_means[gaussian],
            self->speech_stds[gaussian], &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update `sum_log_likelihood_ratios` with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t)(log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision: this channel alone is strong enough evidence.
      if ((log_likelihood_ratio * 4) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below are applied on the
      // hard coded number of Gaussians set to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t)(h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for each
        // Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the first
        // Gaussian and 0 to the rest (which is already set at initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the GMM.
      h1 = (int16_t)(h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for each
        // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {
      // Get minimum value in past which is used for long term correction in Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t)(noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = `noise_probability[k]` /
          //              (`noise_probability[0]` + `noise_probability[1]`)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t)((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t)((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // `deltaS` = (x-mu)/sigma^2
          // sgprvec[k] = `speech_probability[k]` /
          //              (`speech_probability[0]` + `speech_probability[1]`)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Control that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
          // Q20  * approx 0.001 (2^-10=0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;        // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // `noise_global_mean` in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // `speech_global_mean` in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // `diff` = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t)(speech_global_mean >> 9) -
             (int16_t)(noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // `tmp1_s16` = ~0.8 * (kMinimumDifference - diff) in Q7.
        // `tmp2_s16` = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);

        // Move Gaussian means for speech model by `tmp1_s16` and update
        // `speech_global_mean`. Note that `self->speech_means[channel]` is
        // changed after the call.
        speech_global_mean =
            WeightedAverage(&self->speech_means[channel], tmp1_s16,
                            &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -`tmp2_s16` and update
        // `noise_global_mean`. Note that `self->noise_means[channel]` is
        // changed after the call.
        noise_global_mean =
            WeightedAverage(&self->noise_means[channel], -tmp2_s16,
                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t)(speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t)(noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
  if (!vadflag) {
    if (self->over_hang > 0) {
      // During hangover frames the return value is 2 + remaining hangover,
      // i.e. still non-zero (treated as speech).
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      // After a long enough speech run, use the longer hangover.
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=`kDefaultMode`).
538 if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) { 539 return -1; 540 } 541 542 self->init_flag = kInitCheck; 543 544 return 0; 545 } 546 547 // Set aggressiveness mode 548 int WebRtcVad_set_mode_core(VadInstT* self, int mode) { 549 int return_value = 0; 550 551 switch (mode) { 552 case 0: 553 // Quality mode. 554 memcpy(self->over_hang_max_1, kOverHangMax1Q, 555 sizeof(self->over_hang_max_1)); 556 memcpy(self->over_hang_max_2, kOverHangMax2Q, 557 sizeof(self->over_hang_max_2)); 558 memcpy(self->individual, kLocalThresholdQ, sizeof(self->individual)); 559 memcpy(self->total, kGlobalThresholdQ, sizeof(self->total)); 560 break; 561 case 1: 562 // Low bitrate mode. 563 memcpy(self->over_hang_max_1, kOverHangMax1LBR, 564 sizeof(self->over_hang_max_1)); 565 memcpy(self->over_hang_max_2, kOverHangMax2LBR, 566 sizeof(self->over_hang_max_2)); 567 memcpy(self->individual, kLocalThresholdLBR, sizeof(self->individual)); 568 memcpy(self->total, kGlobalThresholdLBR, sizeof(self->total)); 569 break; 570 case 2: 571 // Aggressive mode. 572 memcpy(self->over_hang_max_1, kOverHangMax1AGG, 573 sizeof(self->over_hang_max_1)); 574 memcpy(self->over_hang_max_2, kOverHangMax2AGG, 575 sizeof(self->over_hang_max_2)); 576 memcpy(self->individual, kLocalThresholdAGG, sizeof(self->individual)); 577 memcpy(self->total, kGlobalThresholdAGG, sizeof(self->total)); 578 break; 579 case 3: 580 // Very aggressive mode. 581 memcpy(self->over_hang_max_1, kOverHangMax1VAG, 582 sizeof(self->over_hang_max_1)); 583 memcpy(self->over_hang_max_2, kOverHangMax2VAG, 584 sizeof(self->over_hang_max_2)); 585 memcpy(self->individual, kLocalThresholdVAG, sizeof(self->individual)); 586 memcpy(self->total, kGlobalThresholdVAG, sizeof(self->total)); 587 break; 588 default: 589 return_value = -1; 590 break; 591 } 592 593 return return_value; 594 } 595 596 // Calculate VAD decision by first extracting feature values and then calculate 597 // probability for both speech and background noise. 
598 599 int WebRtcVad_CalcVad48khz(VadInstT* inst, 600 const int16_t* speech_frame, 601 size_t frame_length) { 602 int vad; 603 size_t i; 604 int16_t speech_nb[240]; // 30 ms in 8 kHz. 605 // `tmp_mem` is a temporary memory used by resample function, length is 606 // frame length in 10 ms (480 samples) + 256 extra. 607 int32_t tmp_mem[480 + 256] = {0}; 608 const size_t kFrameLen10ms48khz = 480; 609 const size_t kFrameLen10ms8khz = 80; 610 size_t num_10ms_frames = frame_length / kFrameLen10ms48khz; 611 612 for (i = 0; i < num_10ms_frames; i++) { 613 WebRtcSpl_Resample48khzTo8khz(speech_frame, 614 &speech_nb[i * kFrameLen10ms8khz], 615 &inst->state_48_to_8, tmp_mem); 616 } 617 618 // Do VAD on an 8 kHz signal 619 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); 620 621 return vad; 622 } 623 624 int WebRtcVad_CalcVad32khz(VadInstT* inst, 625 const int16_t* speech_frame, 626 size_t frame_length) { 627 size_t len; 628 int vad; 629 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) 630 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) 631 632 // Downsample signal 32->16->8 before doing VAD 633 WebRtcVad_Downsampling(speech_frame, speechWB, 634 &(inst->downsampling_filter_states[2]), frame_length); 635 len = frame_length / 2; 636 637 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, 638 len); 639 len /= 2; 640 641 // Do VAD on an 8 kHz signal 642 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); 643 644 return vad; 645 } 646 647 int WebRtcVad_CalcVad16khz(VadInstT* inst, 648 const int16_t* speech_frame, 649 size_t frame_length) { 650 size_t len; 651 int vad; 652 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) 653 654 // Wideband: Downsample signal before doing VAD 655 WebRtcVad_Downsampling(speech_frame, speechNB, 656 inst->downsampling_filter_states, frame_length); 657 658 len = frame_length / 2; 659 vad = WebRtcVad_CalcVad8khz(inst, speechNB, 
len); 660 661 return vad; 662 } 663 664 int WebRtcVad_CalcVad8khz(VadInstT* inst, 665 const int16_t* speech_frame, 666 size_t frame_length) { 667 int16_t feature_vector[kNumChannels], total_power; 668 669 // Get power in the bands 670 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, 671 feature_vector); 672 673 // Make a VAD 674 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); 675 676 return inst->vad; 677 }