tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vad_audio_proc.cc (10899B)


      1 /*
      2 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include "modules/audio_processing/vad/vad_audio_proc.h"
     12 
     13 #include <cmath>
     14 #include <cstdint>
     15 #include <cstdio>
     16 #include <cstring>
     17 
     18 #include "common_audio/third_party/ooura/fft_size_256/fft4g.h"
     19 #include "modules/audio_processing/vad/common.h"
     20 #include "modules/audio_processing/vad/pitch_internal.h"
     21 #include "modules/audio_processing/vad/pole_zero_filter.h"
     22 #include "modules/audio_processing/vad/vad_audio_proc_internal.h"
     23 #include "rtc_base/checks.h"
     24 extern "C" {
     25 #include "modules/audio_coding/codecs/isac/main/source/filter_functions.h"
     26 #include "modules/audio_coding/codecs/isac/main/source/isac_vad.h"
     27 #include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h"
     28 #include "modules/audio_coding/codecs/isac/main/source/structs.h"
     29 }
     30 
     31 namespace webrtc {
     32 
     33 // The following structures are declared anonymous in iSAC's structs.h. To
     34 // forward declare them, we use this derived class trick.
     35 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {};
     36 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {};
     37 
     38 static constexpr float kFrequencyResolution =
     39    kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize);
     40 static constexpr int kSilenceRms = 5;
     41 
     42 // TODO(turajs): Make a Create or Init for VadAudioProc.
     43 VadAudioProc::VadAudioProc()
     44    : audio_buffer_(),
     45      num_buffer_samples_(kNumPastSignalSamples),
     46      log_old_gain_(-2),
     47      old_lag_(50),  // Arbitrary but valid as pitch-lag (in samples).
     48      pitch_analysis_handle_(new PitchAnalysisStruct),
     49      pre_filter_handle_(new PreFiltBankstr),
     50      high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator,
     51                                               kFilterOrder,
     52                                               kCoeffDenominator,
     53                                               kFilterOrder)) {
     54  static_assert(kNumPastSignalSamples + kNumSubframeSamples ==
     55                    sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]),
     56                "lpc analysis window incorrect size");
     57  static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]),
     58                "correlation weight incorrect size");
     59 
     60  // TODO(turajs): Are we doing too much in the constructor?
     61  float data[kDftSize];
     62  // Make FFT to initialize.
     63  ip_[0] = 0;
     64  WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
     65  // TODO(turajs): Need to initialize high-pass filter.
     66 
     67  // Initialize iSAC components.
     68  WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get());
     69  WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get());
     70 }
     71 
     72 VadAudioProc::~VadAudioProc() {}
     73 
     74 void VadAudioProc::ResetBuffer() {
     75  memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess],
     76         sizeof(audio_buffer_[0]) * kNumPastSignalSamples);
     77  num_buffer_samples_ = kNumPastSignalSamples;
     78 }
     79 
     80 int VadAudioProc::ExtractFeatures(const int16_t* frame,
     81                                  size_t length,
     82                                  AudioFeatures* features) {
     83  features->num_frames = 0;
     84  if (length != kNumSubframeSamples) {
     85    return -1;
     86  }
     87 
     88  // High-pass filter to remove the DC component and very low frequency content.
     89  // We have experienced that this high-pass filtering improves voice/non-voiced
     90  // classification.
     91  if (high_pass_filter_->Filter(frame, kNumSubframeSamples,
     92                                &audio_buffer_[num_buffer_samples_]) != 0) {
     93    return -1;
     94  }
     95 
     96  num_buffer_samples_ += kNumSubframeSamples;
     97  if (num_buffer_samples_ < kBufferLength) {
     98    return 0;
     99  }
    100  RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength);
    101  features->num_frames = kNum10msSubframes;
    102  features->silence = false;
    103 
    104  Rms(features->rms, kMaxNumFrames);
    105  for (size_t i = 0; i < kNum10msSubframes; ++i) {
    106    if (features->rms[i] < kSilenceRms) {
    107      // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence.
    108      // Bail out here instead.
    109      features->silence = true;
    110      ResetBuffer();
    111      return 0;
    112    }
    113  }
    114 
    115  PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz,
    116                kMaxNumFrames);
    117  FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames);
    118  ResetBuffer();
    119  return 0;
    120 }
    121 
    122 // Computes |kLpcOrder + 1| correlation coefficients.
    123 void VadAudioProc::SubframeCorrelation(double* corr,
    124                                       size_t length_corr,
    125                                       size_t subframe_index) {
    126  RTC_DCHECK_GE(length_corr, kLpcOrder + 1);
    127  double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples];
    128  size_t buffer_index = subframe_index * kNumSubframeSamples;
    129 
    130  for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++)
    131    windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n];
    132 
    133  WebRtcIsac_AutoCorr(corr, windowed_audio,
    134                      kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder);
    135 }
    136 
    137 // Compute `kNum10msSubframes` sets of LPC coefficients, one per 10 ms input.
    138 // The analysis window is 15 ms long and it is centered on the first half of
    139 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the
    140 // first half of each 10 ms subframe.
    141 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) {
    142  RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1));
    143  double corr[kLpcOrder + 1];
    144  double reflec_coeff[kLpcOrder];
    145  for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes;
    146       i++, offset_lpc += kLpcOrder + 1) {
    147    SubframeCorrelation(corr, kLpcOrder + 1, i);
    148    corr[0] *= 1.0001;
    149    // This makes Lev-Durb a bit more stable.
    150    for (size_t k = 0; k < kLpcOrder + 1; k++) {
    151      corr[k] *= kCorrWeight[k];
    152    }
    153    WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder);
    154  }
    155 }
    156 
    157 // Fit a second order curve to these 3 points and find the location of the
    158 // extremum. The points are inverted before curve fitting.
    159 static float QuadraticInterpolation(float prev_val,
    160                                    float curr_val,
    161                                    float next_val) {
    162  // Doing the interpolation in |1 / A(z)|^2.
    163  float fractional_index = 0;
    164  next_val = 1.0f / next_val;
    165  prev_val = 1.0f / prev_val;
    166  curr_val = 1.0f / curr_val;
    167 
    168  fractional_index =
    169      -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val);
    170  RTC_DCHECK_LT(fabs(fractional_index), 1);
    171  return fractional_index;
    172 }
    173 
    174 // 1 / A(z), where A(z) is defined by `lpc` is a model of the spectral envelope
    175 // of the input signal. The local maximum of the spectral envelope corresponds
    176 // with the local minimum of A(z). It saves complexity, as we save one
    177 // inversion. Furthermore, we find the first local maximum of magnitude squared,
    178 // to save on one square root.
    179 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak,
    180                                          size_t length_f_peak) {
    181  RTC_DCHECK_GE(length_f_peak, kNum10msSubframes);
    182  double lpc[kNum10msSubframes * (kLpcOrder + 1)];
    183  // For all sub-frames.
    184  GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1));
    185 
    186  const size_t kNumDftCoefficients = kDftSize / 2 + 1;
    187  float data[kDftSize];
    188 
    189  for (size_t i = 0; i < kNum10msSubframes; i++) {
    190    // Convert to float with zero pad.
    191    memset(data, 0, sizeof(data));
    192    for (size_t n = 0; n < kLpcOrder + 1; n++) {
    193      data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]);
    194    }
    195    // Transform to frequency domain.
    196    WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_);
    197 
    198    size_t index_peak = 0;
    199    float prev_magn_sqr = data[0] * data[0];
    200    float curr_magn_sqr = data[2] * data[2] + data[3] * data[3];
    201    float next_magn_sqr;
    202    bool found_peak = false;
    203    for (size_t n = 2; n < kNumDftCoefficients - 1; n++) {
    204      next_magn_sqr =
    205          data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1];
    206      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
    207        found_peak = true;
    208        index_peak = n - 1;
    209        break;
    210      }
    211      prev_magn_sqr = curr_magn_sqr;
    212      curr_magn_sqr = next_magn_sqr;
    213    }
    214    float fractional_index = 0;
    215    if (!found_peak) {
    216      // Checking if |kNumDftCoefficients - 1| is the local minimum.
    217      next_magn_sqr = data[1] * data[1];
    218      if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) {
    219        index_peak = kNumDftCoefficients - 1;
    220      }
    221    } else {
    222      // A peak is found, do a simple quadratic interpolation to get a more
    223      // accurate estimate of the peak location.
    224      fractional_index =
    225          QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr);
    226    }
    227    f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution;
    228  }
    229 }
    230 
    231 // Using iSAC functions to estimate pitch gains & lags.
    232 void VadAudioProc::PitchAnalysis(double* log_pitch_gains,
    233                                 double* pitch_lags_hz,
    234                                 size_t length) {
    235  // TODO(turajs): This can be "imported" from iSAC & and the next two
    236  // constants.
    237  RTC_DCHECK_GE(length, kNum10msSubframes);
    238  const int kNumPitchSubframes = 4;
    239  double gains[kNumPitchSubframes];
    240  double lags[kNumPitchSubframes];
    241 
    242  const int kNumSubbandFrameSamples = 240;
    243  const int kNumLookaheadSamples = 24;
    244 
    245  float lower[kNumSubbandFrameSamples];
    246  float upper[kNumSubbandFrameSamples];
    247  double lower_lookahead[kNumSubbandFrameSamples];
    248  double upper_lookahead[kNumSubbandFrameSamples];
    249  double lower_lookahead_pre_filter[kNumSubbandFrameSamples +
    250                                    kNumLookaheadSamples];
    251 
    252  // Split signal to lower and upper bands
    253  WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower,
    254                                 upper, lower_lookahead, upper_lookahead,
    255                                 pre_filter_handle_.get());
    256  WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter,
    257                           pitch_analysis_handle_.get(), lags, gains);
    258 
    259  // Lags are computed on lower-band signal with sampling rate half of the
    260  // input signal.
    261  GetSubframesPitchParameters(
    262      kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes,
    263      &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz);
    264 }
    265 
    266 void VadAudioProc::Rms(double* rms, size_t length_rms) {
    267  RTC_DCHECK_GE(length_rms, kNum10msSubframes);
    268  size_t offset = kNumPastSignalSamples;
    269  for (size_t i = 0; i < kNum10msSubframes; i++) {
    270    rms[i] = 0;
    271    for (size_t n = 0; n < kNumSubframeSamples; n++, offset++)
    272      rms[i] += audio_buffer_[offset] * audio_buffer_[offset];
    273    rms[i] = sqrt(rms[i] / kNumSubframeSamples);
    274  }
    275 }
    276 
    277 }  // namespace webrtc