vad_core.c (26200B)
/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "common_audio/vad/vad_core.h"

#include "common_audio/signal_processing/include/signal_processing_library.h"
#include "common_audio/vad/vad_filterbank.h"
#include "common_audio/vad/vad_gmm.h"
#include "common_audio/vad/vad_sp.h"
#include "rtc_base/sanitizer.h"

// Constant tables below use fixed-point "Qn" notation: a value in Qn
// represents (value / 2^n).
//
// Spectrum weighting, one weight per frequency sub-band (`kNumChannels`),
// applied to per-channel log-likelihood ratios in the global VAD decision.
static const int16_t kSpectrumWeight[kNumChannels] = {6, 8, 10, 12, 14, 16};
// Smoothing constants for updating the noise/speech model means.
static const int16_t kNoiseUpdateConst = 655;    // Q15
static const int16_t kSpeechUpdateConst = 6554;  // Q15
// Long-term correction factor for the noise mean.
static const int16_t kBackEta = 154;  // Q8
// Minimum difference between the two models, Q5
static const int16_t kMinimumDifference[kNumChannels] = {544, 544, 576,
                                                         576, 576, 576};
// Upper limit of mean value for speech model, Q7
static const int16_t kMaximumSpeech[kNumChannels] = {11392, 11392, 11520,
                                                     11520, 11520, 11520};
// Minimum value for mean value
static const int16_t kMinimumMean[kNumGaussians] = {640, 768};
// Upper limit of mean value for noise model, Q7
static const int16_t kMaximumNoise[kNumChannels] = {9216, 9088, 8960,
                                                    8832, 8704, 8576};
// Start values for the Gaussian models, Q7.
// Each table holds `kTableSize` = `kNumGaussians` * `kNumChannels` entries,
// laid out as [gaussian0: ch0..ch5, gaussian1: ch0..ch5].
//
// Weights for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataWeights[kTableSize] = {34, 62, 72, 66, 53, 25,
                                                      94, 66, 56, 62, 75, 103};
// Weights for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataWeights[kTableSize] = {48, 82, 45, 87, 50, 47,
                                                       80, 46, 83, 41, 78, 81};
// Means for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataMeans[kTableSize] = {
    6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362};
// Means for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataMeans[kTableSize] = {8306, 10085, 10078, 11823,
                                                     11843, 6309, 9473, 9571,
                                                     10879, 7581, 8180, 7483};
// Stds for the two Gaussians for the six channels (noise)
static const int16_t kNoiseDataStds[kTableSize] = {
    378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455};
// Stds for the two Gaussians for the six channels (speech)
static const int16_t kSpeechDataStds[kTableSize] = {
    555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850};

// Constants used in GmmProbability().
//
// Maximum number of counted speech (VAD = 1) frames in a row.
static const int16_t kMaxSpeechFrames = 6;
// Minimum standard deviation for both speech and noise.
static const int16_t kMinStd = 384;

// Constants in WebRtcVad_InitCore().
// Default aggressiveness mode.
static const short kDefaultMode = 0;
static const int kInitCheck = 42;

// Constants used in WebRtcVad_set_mode_core().
//
// Thresholds for different frame lengths (10 ms, 20 ms and 30 ms), i.e. each
// array is indexed {0, 1, 2} by frame length. `kOverHangMax*` are hangover
// lengths (number of frames still reported as speech after speech ends);
// `kLocalThreshold*`/`kGlobalThreshold*` are per-channel and summed
// log-likelihood-ratio decision thresholds.
//
// Mode 0, Quality.
static const int16_t kOverHangMax1Q[3] = {8, 4, 3};
static const int16_t kOverHangMax2Q[3] = {14, 7, 5};
static const int16_t kLocalThresholdQ[3] = {24, 21, 24};
static const int16_t kGlobalThresholdQ[3] = {57, 48, 57};
// Mode 1, Low bitrate.
static const int16_t kOverHangMax1LBR[3] = {8, 4, 3};
static const int16_t kOverHangMax2LBR[3] = {14, 7, 5};
static const int16_t kLocalThresholdLBR[3] = {37, 32, 37};
static const int16_t kGlobalThresholdLBR[3] = {100, 80, 100};
// Mode 2, Aggressive.
83 static const int16_t kOverHangMax1AGG[3] = {6, 3, 2}; 84 static const int16_t kOverHangMax2AGG[3] = {9, 5, 3}; 85 static const int16_t kLocalThresholdAGG[3] = {82, 78, 82}; 86 static const int16_t kGlobalThresholdAGG[3] = {285, 260, 285}; 87 // Mode 3, Very aggressive. 88 static const int16_t kOverHangMax1VAG[3] = {6, 3, 2}; 89 static const int16_t kOverHangMax2VAG[3] = {9, 5, 3}; 90 static const int16_t kLocalThresholdVAG[3] = {94, 94, 94}; 91 static const int16_t kGlobalThresholdVAG[3] = {1100, 1050, 1100}; 92 93 // Calculates the weighted average w.r.t. number of Gaussians. The `data` are 94 // updated with an `offset` before averaging. 95 // 96 // - data [i/o] : Data to average. 97 // - offset [i] : An offset added to `data`. 98 // - weights [i] : Weights used for averaging. 99 // 100 // returns : The weighted average. 101 static int32_t WeightedAverage(int16_t* data, 102 int16_t offset, 103 const int16_t* weights) { 104 int k; 105 int32_t weighted_average = 0; 106 107 for (k = 0; k < kNumGaussians; k++) { 108 data[k * kNumChannels] += offset; 109 weighted_average += data[k * kNumChannels] * weights[k * kNumChannels]; 110 } 111 return weighted_average; 112 } 113 114 // An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still 115 // undefined behavior, so not a good idea; this just makes UBSan ignore the 116 // violation, so that our old code can continue to do what it's always been 117 // doing.) 118 static inline int32_t RTC_NO_SANITIZE("signed-integer-overflow") 119 OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) { 120 return a * b; 121 } 122 123 // Calculates the probabilities for both speech and background noise using 124 // Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which 125 // type of signal is most probable. 
//
// - self         [i/o] : Pointer to VAD instance
// - features     [i]   : Feature vector of length `kNumChannels`
//                        = log10(energy in frequency band)
// - total_power  [i]   : Total power in audio frame.
// - frame_length [i]   : Number of input samples
//
// - returns            : the VAD decision (0 - noise, non-zero - speech; the
//                        value may exceed 1 during hangover frames).
static int16_t GmmProbability(VadInstT* self,
                              int16_t* features,
                              int16_t total_power,
                              size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = {0};  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = {0};  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  // Set various thresholds based on frame lengths (80, 160 or 240 samples).
  if (frame_length == 80) {
    overhead1 = self->over_hang_max_1[0];
    overhead2 = self->over_hang_max_2[0];
    individualTest = self->individual[0];
    totalTest = self->total[0];
  } else if (frame_length == 160) {
    overhead1 = self->over_hang_max_1[1];
    overhead2 = self->over_hang_max_2[1];
    individualTest = self->individual[1];
    totalTest = self->total[1];
  } else {
    overhead1 = self->over_hang_max_1[2];
    overhead2 = self->over_hang_max_2[2];
    individualTest = self->individual[2];
    totalTest = self->total[2];
  }

  if (total_power > kMinEnergy) {
    // The signal power of current frame is large enough for processing. The
    // processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as `channel`.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // `kNumGaussians`, with different means and standard deviations depending
      // on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(
            features[channel], self->noise_means[gaussian],
            self->noise_stds[gaussian], &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(
            features[channel], self->speech_means[gaussian],
            self->speech_stds[gaussian], &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update `sum_log_likelihood_ratios` with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t)(log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision: this channel alone is strong enough evidence.
      if ((log_likelihood_ratio * 4) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below are applied on the
      // hard coded number of Gaussians set to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t)(h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for each
        // Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the first
        // Gaussian and 0 to the rest (which is already set at initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the GMM.
      h1 = (int16_t)(h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for each
        // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {
      // Get minimum value in past which is used for long term correction in Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t)(noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        nmk = self->noise_means[gaussian];
        smk = self->speech_means[gaussian];
        nsk = self->noise_stds[gaussian];
        ssk = self->speech_stds[gaussian];

        // Update noise mean vector if the frame consists of noise only.
        nmk2 = nmk;
        if (!vadflag) {
          // deltaN = (x-mu)/sigma^2
          // ngprvec[k] = `noise_probability[k]` /
          //              (`noise_probability[0]` + `noise_probability[1]`)

          // (Q14 * Q11 >> 11) = Q14.
          delt = (int16_t)((ngprvec[gaussian] * deltaN[gaussian]) >> 11);
          // Q7 + (Q14 * Q15 >> 22) = Q7.
          nmk2 = nmk + (int16_t)((delt * kNoiseUpdateConst) >> 22);
        }

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift too much.
        tmp_s16 = (int16_t)((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t)((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        if (vadflag) {
          // Update speech mean vector:
          // `deltaS` = (x-mu)/sigma^2
          // sgprvec[k] = `speech_probability[k]` /
          //              (`speech_probability[0]` + `speech_probability[1]`)

          // (Q14 * Q11) >> 11 = Q14.
          delt = (int16_t)((sgprvec[gaussian] * deltaS[gaussian]) >> 11);
          // Q14 * Q15 >> 21 = Q8.
          tmp_s16 = (int16_t)((delt * kSpeechUpdateConst) >> 21);
          // Q7 + (Q8 >> 1) = Q7. With rounding.
          smk2 = smk + ((tmp_s16 + 1) >> 1);

          // Control that the speech mean does not drift too much.
          maxmu = maxspe + 640;
          if (smk2 < kMinimumMean[k]) {
            smk2 = kMinimumMean[k];
          }
          if (smk2 > maxmu) {
            smk2 = maxmu;
          }
          self->speech_means[gaussian] = smk2;  // Q7.

          // (Q7 >> 3) = Q4. With rounding.
          tmp_s16 = ((smk + 4) >> 3);

          tmp_s16 = features[channel] - tmp_s16;  // Q4
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaS[gaussian] * tmp_s16) >> 3;
          tmp2_s32 = tmp1_s32 - 4096;
          tmp_s16 = sgprvec[gaussian] >> 2;
          // (Q14 >> 2) * Q12 = Q24.
          tmp1_s32 = tmp_s16 * tmp2_s32;

          tmp2_s32 = tmp1_s32 >> 4;  // Q20

          // 0.1 * Q20 / Q7 = Q13.
          if (tmp2_s32 > 0) {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(tmp2_s32, ssk * 10);
          } else {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(-tmp2_s32, ssk * 10);
            tmp_s16 = -tmp_s16;
          }
          // Divide by 4 giving an update factor of 0.025 (= 0.1 / 4).
          // Note that division by 4 equals shift by 2, hence,
          // (Q13 >> 8) = (Q13 >> 6) / 4 = Q7.
          tmp_s16 += 128;  // Rounding.
          ssk += (tmp_s16 >> 8);
          if (ssk < kMinStd) {
            ssk = kMinStd;
          }
          self->speech_stds[gaussian] = ssk;
        } else {
          // Update GMM variance vectors.
          // deltaN * (features[channel] - nmk) - 1
          // Q4 - (Q7 >> 3) = Q4.
          tmp_s16 = features[channel] - (nmk >> 3);
          // (Q11 * Q4 >> 3) = Q12.
          tmp1_s32 = (deltaN[gaussian] * tmp_s16) >> 3;
          tmp1_s32 -= 4096;

          // (Q14 >> 2) * Q12 = Q24.
          tmp_s16 = (ngprvec[gaussian] + 2) >> 2;
          tmp2_s32 = OverflowingMulS16ByS32ToS32(tmp_s16, tmp1_s32);
          // Q20  * approx 0.001 (2^-10=0.0009766), hence,
          // (Q24 >> 14) = (Q24 >> 4) / 2^10 = Q20.
          tmp1_s32 = tmp2_s32 >> 14;

          // Q20 / Q7 = Q13.
          if (tmp1_s32 > 0) {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(tmp1_s32, nsk);
          } else {
            tmp_s16 = (int16_t)WebRtcSpl_DivW32W16(-tmp1_s32, nsk);
            tmp_s16 = -tmp_s16;
          }
          tmp_s16 += 32;        // Rounding
          nsk += tmp_s16 >> 6;  // Q13 >> 6 = Q7.
          if (nsk < kMinStd) {
            nsk = kMinStd;
          }
          self->noise_stds[gaussian] = nsk;
        }
      }

      // Separate models if they are too close.
      // `noise_global_mean` in Q14 (= Q7 * Q7).
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);

      // `speech_global_mean` in Q14 (= Q7 * Q7).
      speech_global_mean = WeightedAverage(&self->speech_means[channel], 0,
                                           &kSpeechDataWeights[channel]);

      // `diff` = "global" speech mean - "global" noise mean.
      // (Q14 >> 9) - (Q14 >> 9) = Q5.
      diff = (int16_t)(speech_global_mean >> 9) -
             (int16_t)(noise_global_mean >> 9);
      if (diff < kMinimumDifference[channel]) {
        tmp_s16 = kMinimumDifference[channel] - diff;

        // `tmp1_s16` = ~0.8 * (kMinimumDifference - diff) in Q7.
        // `tmp2_s16` = ~0.2 * (kMinimumDifference - diff) in Q7.
        tmp1_s16 = (int16_t)((13 * tmp_s16) >> 2);
        tmp2_s16 = (int16_t)((3 * tmp_s16) >> 2);

        // Move Gaussian means for speech model by `tmp1_s16` and update
        // `speech_global_mean`. Note that `self->speech_means[channel]` is
        // changed after the call.
        speech_global_mean =
            WeightedAverage(&self->speech_means[channel], tmp1_s16,
                            &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -`tmp2_s16` and update
        // `noise_global_mean`. Note that `self->noise_means[channel]` is
        // changed after the call.
        noise_global_mean =
            WeightedAverage(&self->noise_means[channel], -tmp2_s16,
                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift too much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t)(speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }

      tmp2_s16 = (int16_t)(noise_global_mean >> 7);
      if (tmp2_s16 > kMaximumNoise[channel]) {
        tmp2_s16 -= kMaximumNoise[channel];

        for (k = 0; k < kNumGaussians; k++) {
          self->noise_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
    }
    self->frame_counter++;
  }

  // Smooth with respect to transition hysteresis.
  if (!vadflag) {
    if (self->over_hang > 0) {
      // During hangover frames the return value is 2 + remaining hangover,
      // i.e. still non-zero (treated as speech).
      vadflag = 2 + self->over_hang;
      self->over_hang--;
    }
    self->num_of_speech = 0;
  } else {
    self->num_of_speech++;
    if (self->num_of_speech > kMaxSpeechFrames) {
      // After a long enough speech run, use the longer hangover.
      self->num_of_speech = kMaxSpeechFrames;
      self->over_hang = overhead2;
    } else {
      self->over_hang = overhead1;
    }
  }
  return vadflag;
}

// Initialize the VAD. Set aggressiveness mode to default value.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors.
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }

  // Initialize splitting filter states.
  memset(self->upper_state, 0, sizeof(self->upper_state));
  memset(self->lower_state, 0, sizeof(self->lower_state));

  // Initialize high pass filter states.
  memset(self->hp_filter_state, 0, sizeof(self->hp_filter_state));

  // Initialize mean value memory, for WebRtcVad_FindMinimum().
  for (i = 0; i < kNumChannels; i++) {
    self->mean_value[i] = 1600;
  }

  // Set aggressiveness mode to default (=`kDefaultMode`).
538 if (WebRtcVad_set_mode_core(self, kDefaultMode) != 0) { 539 return -1; 540 } 541 542 self->init_flag = kInitCheck; 543 544 return 0; 545 } 546 547 // Set aggressiveness mode 548 int WebRtcVad_set_mode_core(VadInstT* self, int mode) { 549 int return_value = 0; 550 551 switch (mode) { 552 case 0: 553 // Quality mode. 554 memcpy(self->over_hang_max_1, kOverHangMax1Q, 555 sizeof(self->over_hang_max_1)); 556 memcpy(self->over_hang_max_2, kOverHangMax2Q, 557 sizeof(self->over_hang_max_2)); 558 memcpy(self->individual, kLocalThresholdQ, sizeof(self->individual)); 559 memcpy(self->total, kGlobalThresholdQ, sizeof(self->total)); 560 break; 561 case 1: 562 // Low bitrate mode. 563 memcpy(self->over_hang_max_1, kOverHangMax1LBR, 564 sizeof(self->over_hang_max_1)); 565 memcpy(self->over_hang_max_2, kOverHangMax2LBR, 566 sizeof(self->over_hang_max_2)); 567 memcpy(self->individual, kLocalThresholdLBR, sizeof(self->individual)); 568 memcpy(self->total, kGlobalThresholdLBR, sizeof(self->total)); 569 break; 570 case 2: 571 // Aggressive mode. 572 memcpy(self->over_hang_max_1, kOverHangMax1AGG, 573 sizeof(self->over_hang_max_1)); 574 memcpy(self->over_hang_max_2, kOverHangMax2AGG, 575 sizeof(self->over_hang_max_2)); 576 memcpy(self->individual, kLocalThresholdAGG, sizeof(self->individual)); 577 memcpy(self->total, kGlobalThresholdAGG, sizeof(self->total)); 578 break; 579 case 3: 580 // Very aggressive mode. 581 memcpy(self->over_hang_max_1, kOverHangMax1VAG, 582 sizeof(self->over_hang_max_1)); 583 memcpy(self->over_hang_max_2, kOverHangMax2VAG, 584 sizeof(self->over_hang_max_2)); 585 memcpy(self->individual, kLocalThresholdVAG, sizeof(self->individual)); 586 memcpy(self->total, kGlobalThresholdVAG, sizeof(self->total)); 587 break; 588 default: 589 return_value = -1; 590 break; 591 } 592 593 return return_value; 594 } 595 596 // Calculate VAD decision by first extracting feature values and then calculate 597 // probability for both speech and background noise. 
598 599 int WebRtcVad_CalcVad48khz(VadInstT* inst, 600 const int16_t* speech_frame, 601 size_t frame_length) { 602 int vad; 603 size_t i; 604 int16_t speech_nb[240]; // 30 ms in 8 kHz. 605 // `tmp_mem` is a temporary memory used by resample function, length is 606 // frame length in 10 ms (480 samples) + 256 extra. 607 int32_t tmp_mem[480 + 256] = {0}; 608 const size_t kFrameLen10ms48khz = 480; 609 const size_t kFrameLen10ms8khz = 80; 610 size_t num_10ms_frames = frame_length / kFrameLen10ms48khz; 611 612 for (i = 0; i < num_10ms_frames; i++) { 613 WebRtcSpl_Resample48khzTo8khz(speech_frame, 614 &speech_nb[i * kFrameLen10ms8khz], 615 &inst->state_48_to_8, tmp_mem); 616 } 617 618 // Do VAD on an 8 kHz signal 619 vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6); 620 621 return vad; 622 } 623 624 int WebRtcVad_CalcVad32khz(VadInstT* inst, 625 const int16_t* speech_frame, 626 size_t frame_length) { 627 size_t len; 628 int vad; 629 int16_t speechWB[480]; // Downsampled speech frame: 960 samples (30ms in SWB) 630 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) 631 632 // Downsample signal 32->16->8 before doing VAD 633 WebRtcVad_Downsampling(speech_frame, speechWB, 634 &(inst->downsampling_filter_states[2]), frame_length); 635 len = frame_length / 2; 636 637 WebRtcVad_Downsampling(speechWB, speechNB, inst->downsampling_filter_states, 638 len); 639 len /= 2; 640 641 // Do VAD on an 8 kHz signal 642 vad = WebRtcVad_CalcVad8khz(inst, speechNB, len); 643 644 return vad; 645 } 646 647 int WebRtcVad_CalcVad16khz(VadInstT* inst, 648 const int16_t* speech_frame, 649 size_t frame_length) { 650 size_t len; 651 int vad; 652 int16_t speechNB[240]; // Downsampled speech frame: 480 samples (30ms in WB) 653 654 // Wideband: Downsample signal before doing VAD 655 WebRtcVad_Downsampling(speech_frame, speechNB, 656 inst->downsampling_filter_states, frame_length); 657 658 len = frame_length / 2; 659 vad = WebRtcVad_CalcVad8khz(inst, speechNB, 
len); 660 661 return vad; 662 } 663 664 int WebRtcVad_CalcVad8khz(VadInstT* inst, 665 const int16_t* speech_frame, 666 size_t frame_length) { 667 int16_t feature_vector[kNumChannels], total_power; 668 669 // Get power in the bands 670 total_power = WebRtcVad_CalculateFeatures(inst, speech_frame, frame_length, 671 feature_vector); 672 673 // Make a VAD 674 inst->vad = GmmProbability(inst, feature_vector, total_power, frame_length); 675 676 return inst->vad; 677 }