spectral_features.cc (9204B)
1 /* 2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/agc2/rnn_vad/spectral_features.h" 12 13 #include <algorithm> 14 #include <array> 15 #include <cmath> 16 #include <limits> 17 #include <numeric> 18 19 #include "api/array_view.h" 20 #include "modules/audio_processing/agc2/rnn_vad/common.h" 21 #include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h" 22 #include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h" 23 #include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h" 24 #include "modules/audio_processing/utility/pffft_wrapper.h" 25 #include "rtc_base/checks.h" 26 #include "rtc_base/numerics/safe_compare.h" 27 28 namespace webrtc { 29 namespace rnn_vad { 30 namespace { 31 32 constexpr float kSilenceThreshold = 0.04f; 33 34 // Computes the new cepstral difference stats and pushes them into the passed 35 // symmetric matrix buffer. 36 void UpdateCepstralDifferenceStats( 37 ArrayView<const float, kNumBands> new_cepstral_coeffs, 38 const RingBuffer<float, kNumBands, kCepstralCoeffsHistorySize>& ring_buf, 39 SymmetricMatrixBuffer<float, kCepstralCoeffsHistorySize>* sym_matrix_buf) { 40 RTC_DCHECK(sym_matrix_buf); 41 // Compute the new cepstral distance stats. 42 std::array<float, kCepstralCoeffsHistorySize - 1> distances; 43 for (int i = 0; i < kCepstralCoeffsHistorySize - 1; ++i) { 44 const int delay = i + 1; 45 auto old_cepstral_coeffs = ring_buf.GetArrayView(delay); 46 distances[i] = 0.f; 47 for (int k = 0; k < kNumBands; ++k) { 48 const float c = new_cepstral_coeffs[k] - old_cepstral_coeffs[k]; 49 distances[i] += c * c; 50 } 51 } 52 // Push the new spectral distance stats into the symmetric matrix buffer. 53 sym_matrix_buf->Push(distances); 54 } 55 56 // Computes the first half of the Vorbis window. 57 std::array<float, kFrameSize20ms24kHz / 2> ComputeScaledHalfVorbisWindow( 58 float scaling = 1.f) { 59 constexpr int kHalfSize = kFrameSize20ms24kHz / 2; 60 std::array<float, kHalfSize> half_window{}; 61 for (int i = 0; i < kHalfSize; ++i) { 62 half_window[i] = 63 scaling * 64 std::sin(0.5 * kPi * std::sin(0.5 * kPi * (i + 0.5) / kHalfSize) * 65 std::sin(0.5 * kPi * (i + 0.5) / kHalfSize)); 66 } 67 return half_window; 68 } 69 70 // Computes the forward FFT on a 20 ms frame to which a given window function is 71 // applied. The Fourier coefficient corresponding to the Nyquist frequency is 72 // set to zero (it is never used and this allows to simplify the code). 73 void ComputeWindowedForwardFft( 74 ArrayView<const float, kFrameSize20ms24kHz> frame, 75 const std::array<float, kFrameSize20ms24kHz / 2>& half_window, 76 Pffft::FloatBuffer* fft_input_buffer, 77 Pffft::FloatBuffer* fft_output_buffer, 78 Pffft* fft) { 79 RTC_DCHECK_EQ(frame.size(), 2 * half_window.size()); 80 // Apply windowing. 81 auto in = fft_input_buffer->GetView(); 82 for (int i = 0, j = kFrameSize20ms24kHz - 1; SafeLt(i, half_window.size()); 83 ++i, --j) { 84 in[i] = frame[i] * half_window[i]; 85 in[j] = frame[j] * half_window[i]; 86 } 87 fft->ForwardTransform(*fft_input_buffer, fft_output_buffer, /*ordered=*/true); 88 // Set the Nyquist frequency coefficient to zero. 89 auto out = fft_output_buffer->GetView(); 90 out[1] = 0.f; 91 } 92 93 } // namespace 94 95 SpectralFeaturesExtractor::SpectralFeaturesExtractor() 96 : half_window_(ComputeScaledHalfVorbisWindow( 97 1.f / static_cast<float>(kFrameSize20ms24kHz))), 98 fft_(kFrameSize20ms24kHz, Pffft::FftType::kReal), 99 fft_buffer_(fft_.CreateBuffer()), 100 reference_frame_fft_(fft_.CreateBuffer()), 101 lagged_frame_fft_(fft_.CreateBuffer()), 102 dct_table_(ComputeDctTable()) {} 103 104 SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default; 105 106 void SpectralFeaturesExtractor::Reset() { 107 cepstral_coeffs_ring_buf_.Reset(); 108 cepstral_diffs_buf_.Reset(); 109 } 110 111 bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures( 112 ArrayView<const float, kFrameSize20ms24kHz> reference_frame, 113 ArrayView<const float, kFrameSize20ms24kHz> lagged_frame, 114 ArrayView<float, kNumBands - kNumLowerBands> higher_bands_cepstrum, 115 ArrayView<float, kNumLowerBands> average, 116 ArrayView<float, kNumLowerBands> first_derivative, 117 ArrayView<float, kNumLowerBands> second_derivative, 118 ArrayView<float, kNumLowerBands> bands_cross_corr, 119 float* variability) { 120 // Compute the Opus band energies for the reference frame. 121 ComputeWindowedForwardFft(reference_frame, half_window_, fft_buffer_.get(), 122 reference_frame_fft_.get(), &fft_); 123 spectral_correlator_.ComputeAutoCorrelation( 124 reference_frame_fft_->GetConstView(), reference_frame_bands_energy_); 125 // Check if the reference frame has silence. 126 const float tot_energy = 127 std::accumulate(reference_frame_bands_energy_.begin(), 128 reference_frame_bands_energy_.end(), 0.f); 129 if (tot_energy < kSilenceThreshold) { 130 return true; 131 } 132 // Compute the Opus band energies for the lagged frame. 133 ComputeWindowedForwardFft(lagged_frame, half_window_, fft_buffer_.get(), 134 lagged_frame_fft_.get(), &fft_); 135 spectral_correlator_.ComputeAutoCorrelation(lagged_frame_fft_->GetConstView(), 136 lagged_frame_bands_energy_); 137 // Log of the band energies for the reference frame. 138 std::array<float, kNumBands> log_bands_energy; 139 ComputeSmoothedLogMagnitudeSpectrum(reference_frame_bands_energy_, 140 log_bands_energy); 141 // Reference frame cepstrum. 142 std::array<float, kNumBands> cepstrum; 143 ComputeDct(log_bands_energy, dct_table_, cepstrum); 144 // Ad-hoc correction terms for the first two cepstral coefficients. 145 cepstrum[0] -= 12.f; 146 cepstrum[1] -= 4.f; 147 // Update the ring buffer and the cepstral difference stats. 148 cepstral_coeffs_ring_buf_.Push(cepstrum); 149 UpdateCepstralDifferenceStats(cepstrum, cepstral_coeffs_ring_buf_, 150 &cepstral_diffs_buf_); 151 // Write the higher bands cepstral coefficients. 152 RTC_DCHECK_EQ(cepstrum.size() - kNumLowerBands, higher_bands_cepstrum.size()); 153 std::copy(cepstrum.begin() + kNumLowerBands, cepstrum.end(), 154 higher_bands_cepstrum.begin()); 155 // Compute and write remaining features. 156 ComputeAvgAndDerivatives(average, first_derivative, second_derivative); 157 ComputeNormalizedCepstralCorrelation(bands_cross_corr); 158 RTC_DCHECK(variability); 159 *variability = ComputeVariability(); 160 return false; 161 } 162 163 void SpectralFeaturesExtractor::ComputeAvgAndDerivatives( 164 ArrayView<float, kNumLowerBands> average, 165 ArrayView<float, kNumLowerBands> first_derivative, 166 ArrayView<float, kNumLowerBands> second_derivative) const { 167 auto curr = cepstral_coeffs_ring_buf_.GetArrayView(0); 168 auto prev1 = cepstral_coeffs_ring_buf_.GetArrayView(1); 169 auto prev2 = cepstral_coeffs_ring_buf_.GetArrayView(2); 170 RTC_DCHECK_EQ(average.size(), first_derivative.size()); 171 RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size()); 172 RTC_DCHECK_LE(average.size(), curr.size()); 173 for (int i = 0; SafeLt(i, average.size()); ++i) { 174 // Average, kernel: [1, 1, 1]. 175 average[i] = curr[i] + prev1[i] + prev2[i]; 176 // First derivative, kernel: [1, 0, - 1]. 177 first_derivative[i] = curr[i] - prev2[i]; 178 // Second derivative, Laplacian kernel: [1, -2, 1]. 179 second_derivative[i] = curr[i] - 2 * prev1[i] + prev2[i]; 180 } 181 } 182 183 void SpectralFeaturesExtractor::ComputeNormalizedCepstralCorrelation( 184 ArrayView<float, kNumLowerBands> bands_cross_corr) { 185 spectral_correlator_.ComputeCrossCorrelation( 186 reference_frame_fft_->GetConstView(), lagged_frame_fft_->GetConstView(), 187 bands_cross_corr_); 188 // Normalize. 189 for (int i = 0; SafeLt(i, bands_cross_corr_.size()); ++i) { 190 bands_cross_corr_[i] = 191 bands_cross_corr_[i] / 192 std::sqrt(0.001f + reference_frame_bands_energy_[i] * 193 lagged_frame_bands_energy_[i]); 194 } 195 // Cepstrum. 196 ComputeDct(bands_cross_corr_, dct_table_, bands_cross_corr); 197 // Ad-hoc correction terms for the first two cepstral coefficients. 198 bands_cross_corr[0] -= 1.3f; 199 bands_cross_corr[1] -= 0.9f; 200 } 201 202 float SpectralFeaturesExtractor::ComputeVariability() const { 203 // Compute cepstral variability score. 204 float variability = 0.f; 205 for (int delay1 = 0; delay1 < kCepstralCoeffsHistorySize; ++delay1) { 206 float min_dist = std::numeric_limits<float>::max(); 207 for (int delay2 = 0; delay2 < kCepstralCoeffsHistorySize; ++delay2) { 208 if (delay1 == delay2) // The distance would be 0. 209 continue; 210 min_dist = 211 std::min(min_dist, cepstral_diffs_buf_.GetValue(delay1, delay2)); 212 } 213 variability += min_dist; 214 } 215 // Normalize (based on training set stats). 216 // TODO(bugs.webrtc.org/10480): Isolate normalization from feature extraction. 217 return variability / kCepstralCoeffsHistorySize - 2.1f; 218 } 219 220 } // namespace rnn_vad 221 } // namespace webrtc