vad_audio_proc.cc (10899B)
1 /* 2 * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/vad/vad_audio_proc.h" 12 13 #include <cmath> 14 #include <cstdint> 15 #include <cstdio> 16 #include <cstring> 17 18 #include "common_audio/third_party/ooura/fft_size_256/fft4g.h" 19 #include "modules/audio_processing/vad/common.h" 20 #include "modules/audio_processing/vad/pitch_internal.h" 21 #include "modules/audio_processing/vad/pole_zero_filter.h" 22 #include "modules/audio_processing/vad/vad_audio_proc_internal.h" 23 #include "rtc_base/checks.h" 24 extern "C" { 25 #include "modules/audio_coding/codecs/isac/main/source/filter_functions.h" 26 #include "modules/audio_coding/codecs/isac/main/source/isac_vad.h" 27 #include "modules/audio_coding/codecs/isac/main/source/pitch_estimator.h" 28 #include "modules/audio_coding/codecs/isac/main/source/structs.h" 29 } 30 31 namespace webrtc { 32 33 // The following structures are declared anonymous in iSAC's structs.h. To 34 // forward declare them, we use this derived class trick. 35 struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; 36 struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; 37 38 static constexpr float kFrequencyResolution = 39 kSampleRateHz / static_cast<float>(VadAudioProc::kDftSize); 40 static constexpr int kSilenceRms = 5; 41 42 // TODO(turajs): Make a Create or Init for VadAudioProc. 43 VadAudioProc::VadAudioProc() 44 : audio_buffer_(), 45 num_buffer_samples_(kNumPastSignalSamples), 46 log_old_gain_(-2), 47 old_lag_(50), // Arbitrary but valid as pitch-lag (in samples). 48 pitch_analysis_handle_(new PitchAnalysisStruct), 49 pre_filter_handle_(new PreFiltBankstr), 50 high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator, 51 kFilterOrder, 52 kCoeffDenominator, 53 kFilterOrder)) { 54 static_assert(kNumPastSignalSamples + kNumSubframeSamples == 55 sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), 56 "lpc analysis window incorrect size"); 57 static_assert(kLpcOrder + 1 == sizeof(kCorrWeight) / sizeof(kCorrWeight[0]), 58 "correlation weight incorrect size"); 59 60 // TODO(turajs): Are we doing too much in the constructor? 61 float data[kDftSize]; 62 // Make FFT to initialize. 63 ip_[0] = 0; 64 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); 65 // TODO(turajs): Need to initialize high-pass filter. 66 67 // Initialize iSAC components. 68 WebRtcIsac_InitPreFilterbank(pre_filter_handle_.get()); 69 WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); 70 } 71 72 VadAudioProc::~VadAudioProc() {} 73 74 void VadAudioProc::ResetBuffer() { 75 memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], 76 sizeof(audio_buffer_[0]) * kNumPastSignalSamples); 77 num_buffer_samples_ = kNumPastSignalSamples; 78 } 79 80 int VadAudioProc::ExtractFeatures(const int16_t* frame, 81 size_t length, 82 AudioFeatures* features) { 83 features->num_frames = 0; 84 if (length != kNumSubframeSamples) { 85 return -1; 86 } 87 88 // High-pass filter to remove the DC component and very low frequency content. 89 // We have experienced that this high-pass filtering improves voice/non-voiced 90 // classification. 91 if (high_pass_filter_->Filter(frame, kNumSubframeSamples, 92 &audio_buffer_[num_buffer_samples_]) != 0) { 93 return -1; 94 } 95 96 num_buffer_samples_ += kNumSubframeSamples; 97 if (num_buffer_samples_ < kBufferLength) { 98 return 0; 99 } 100 RTC_DCHECK_EQ(num_buffer_samples_, kBufferLength); 101 features->num_frames = kNum10msSubframes; 102 features->silence = false; 103 104 Rms(features->rms, kMaxNumFrames); 105 for (size_t i = 0; i < kNum10msSubframes; ++i) { 106 if (features->rms[i] < kSilenceRms) { 107 // PitchAnalysis can cause NaNs in the pitch gain if it's fed silence. 108 // Bail out here instead. 109 features->silence = true; 110 ResetBuffer(); 111 return 0; 112 } 113 } 114 115 PitchAnalysis(features->log_pitch_gain, features->pitch_lag_hz, 116 kMaxNumFrames); 117 FindFirstSpectralPeaks(features->spectral_peak, kMaxNumFrames); 118 ResetBuffer(); 119 return 0; 120 } 121 122 // Computes |kLpcOrder + 1| correlation coefficients. 123 void VadAudioProc::SubframeCorrelation(double* corr, 124 size_t length_corr, 125 size_t subframe_index) { 126 RTC_DCHECK_GE(length_corr, kLpcOrder + 1); 127 double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; 128 size_t buffer_index = subframe_index * kNumSubframeSamples; 129 130 for (size_t n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) 131 windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; 132 133 WebRtcIsac_AutoCorr(corr, windowed_audio, 134 kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); 135 } 136 137 // Compute `kNum10msSubframes` sets of LPC coefficients, one per 10 ms input. 138 // The analysis window is 15 ms long and it is centered on the first half of 139 // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the 140 // first half of each 10 ms subframe. 141 void VadAudioProc::GetLpcPolynomials(double* lpc, size_t length_lpc) { 142 RTC_DCHECK_GE(length_lpc, kNum10msSubframes * (kLpcOrder + 1)); 143 double corr[kLpcOrder + 1]; 144 double reflec_coeff[kLpcOrder]; 145 for (size_t i = 0, offset_lpc = 0; i < kNum10msSubframes; 146 i++, offset_lpc += kLpcOrder + 1) { 147 SubframeCorrelation(corr, kLpcOrder + 1, i); 148 corr[0] *= 1.0001; 149 // This makes Lev-Durb a bit more stable. 150 for (size_t k = 0; k < kLpcOrder + 1; k++) { 151 corr[k] *= kCorrWeight[k]; 152 } 153 WebRtcIsac_LevDurb(&lpc[offset_lpc], reflec_coeff, corr, kLpcOrder); 154 } 155 } 156 157 // Fit a second order curve to these 3 points and find the location of the 158 // extremum. The points are inverted before curve fitting. 159 static float QuadraticInterpolation(float prev_val, 160 float curr_val, 161 float next_val) { 162 // Doing the interpolation in |1 / A(z)|^2. 163 float fractional_index = 0; 164 next_val = 1.0f / next_val; 165 prev_val = 1.0f / prev_val; 166 curr_val = 1.0f / curr_val; 167 168 fractional_index = 169 -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); 170 RTC_DCHECK_LT(fabs(fractional_index), 1); 171 return fractional_index; 172 } 173 174 // 1 / A(z), where A(z) is defined by `lpc` is a model of the spectral envelope 175 // of the input signal. The local maximum of the spectral envelope corresponds 176 // with the local minimum of A(z). It saves complexity, as we save one 177 // inversion. Furthermore, we find the first local maximum of magnitude squared, 178 // to save on one square root. 179 void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, 180 size_t length_f_peak) { 181 RTC_DCHECK_GE(length_f_peak, kNum10msSubframes); 182 double lpc[kNum10msSubframes * (kLpcOrder + 1)]; 183 // For all sub-frames. 184 GetLpcPolynomials(lpc, kNum10msSubframes * (kLpcOrder + 1)); 185 186 const size_t kNumDftCoefficients = kDftSize / 2 + 1; 187 float data[kDftSize]; 188 189 for (size_t i = 0; i < kNum10msSubframes; i++) { 190 // Convert to float with zero pad. 191 memset(data, 0, sizeof(data)); 192 for (size_t n = 0; n < kLpcOrder + 1; n++) { 193 data[n] = static_cast<float>(lpc[i * (kLpcOrder + 1) + n]); 194 } 195 // Transform to frequency domain. 196 WebRtc_rdft(kDftSize, 1, data, ip_, w_fft_); 197 198 size_t index_peak = 0; 199 float prev_magn_sqr = data[0] * data[0]; 200 float curr_magn_sqr = data[2] * data[2] + data[3] * data[3]; 201 float next_magn_sqr; 202 bool found_peak = false; 203 for (size_t n = 2; n < kNumDftCoefficients - 1; n++) { 204 next_magn_sqr = 205 data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; 206 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { 207 found_peak = true; 208 index_peak = n - 1; 209 break; 210 } 211 prev_magn_sqr = curr_magn_sqr; 212 curr_magn_sqr = next_magn_sqr; 213 } 214 float fractional_index = 0; 215 if (!found_peak) { 216 // Checking if |kNumDftCoefficients - 1| is the local minimum. 217 next_magn_sqr = data[1] * data[1]; 218 if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { 219 index_peak = kNumDftCoefficients - 1; 220 } 221 } else { 222 // A peak is found, do a simple quadratic interpolation to get a more 223 // accurate estimate of the peak location. 224 fractional_index = 225 QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); 226 } 227 f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; 228 } 229 } 230 231 // Using iSAC functions to estimate pitch gains & lags. 232 void VadAudioProc::PitchAnalysis(double* log_pitch_gains, 233 double* pitch_lags_hz, 234 size_t length) { 235 // TODO(turajs): This can be "imported" from iSAC & and the next two 236 // constants. 237 RTC_DCHECK_GE(length, kNum10msSubframes); 238 const int kNumPitchSubframes = 4; 239 double gains[kNumPitchSubframes]; 240 double lags[kNumPitchSubframes]; 241 242 const int kNumSubbandFrameSamples = 240; 243 const int kNumLookaheadSamples = 24; 244 245 float lower[kNumSubbandFrameSamples]; 246 float upper[kNumSubbandFrameSamples]; 247 double lower_lookahead[kNumSubbandFrameSamples]; 248 double upper_lookahead[kNumSubbandFrameSamples]; 249 double lower_lookahead_pre_filter[kNumSubbandFrameSamples + 250 kNumLookaheadSamples]; 251 252 // Split signal to lower and upper bands 253 WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower, 254 upper, lower_lookahead, upper_lookahead, 255 pre_filter_handle_.get()); 256 WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, 257 pitch_analysis_handle_.get(), lags, gains); 258 259 // Lags are computed on lower-band signal with sampling rate half of the 260 // input signal. 261 GetSubframesPitchParameters( 262 kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, 263 &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); 264 } 265 266 void VadAudioProc::Rms(double* rms, size_t length_rms) { 267 RTC_DCHECK_GE(length_rms, kNum10msSubframes); 268 size_t offset = kNumPastSignalSamples; 269 for (size_t i = 0; i < kNum10msSubframes; i++) { 270 rms[i] = 0; 271 for (size_t n = 0; n < kNumSubframeSamples; n++, offset++) 272 rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; 273 rms[i] = sqrt(rms[i] / kNumSubframeSamples); 274 } 275 } 276 277 } // namespace webrtc