features_extraction.cc (3777B)
1 /* 2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/agc2/rnn_vad/features_extraction.h" 12 13 #include <array> 14 15 #include "api/array_view.h" 16 #include "modules/audio_processing/agc2/biquad_filter.h" 17 #include "modules/audio_processing/agc2/cpu_features.h" 18 #include "modules/audio_processing/agc2/rnn_vad/common.h" 19 #include "modules/audio_processing/agc2/rnn_vad/lp_residual.h" 20 #include "rtc_base/checks.h" 21 22 namespace webrtc { 23 namespace rnn_vad { 24 namespace { 25 26 // Computed as `scipy.signal.butter(N=2, Wn=60/24000, btype='highpass')`. 27 constexpr BiQuadFilter::Config kHpfConfig24k{ 28 .b = {0.99446179f, -1.98892358f, 0.99446179f}, 29 .a = {-1.98889291f, 0.98895425f}}; 30 31 } // namespace 32 33 FeaturesExtractor::FeaturesExtractor(const AvailableCpuFeatures& cpu_features) 34 : use_high_pass_filter_(false), 35 hpf_(kHpfConfig24k), 36 pitch_buf_24kHz_(), 37 pitch_buf_24kHz_view_(pitch_buf_24kHz_.GetBufferView()), 38 lp_residual_(kBufSize24kHz), 39 lp_residual_view_(lp_residual_.data(), kBufSize24kHz), 40 pitch_estimator_(cpu_features), 41 reference_frame_view_(pitch_buf_24kHz_.GetMostRecentValuesView()) { 42 RTC_DCHECK_EQ(kBufSize24kHz, lp_residual_.size()); 43 Reset(); 44 } 45 46 FeaturesExtractor::~FeaturesExtractor() = default; 47 48 void FeaturesExtractor::Reset() { 49 pitch_buf_24kHz_.Reset(); 50 spectral_features_extractor_.Reset(); 51 if (use_high_pass_filter_) { 52 hpf_.Reset(); 53 } 54 } 55 56 bool FeaturesExtractor::CheckSilenceComputeFeatures( 57 ArrayView<const float, kFrameSize10ms24kHz> samples, 58 ArrayView<float, kFeatureVectorSize> feature_vector) { 59 // Pre-processing. 60 if (use_high_pass_filter_) { 61 std::array<float, kFrameSize10ms24kHz> samples_filtered; 62 hpf_.Process(samples, samples_filtered); 63 // Feed buffer with the pre-processed version of `samples`. 64 pitch_buf_24kHz_.Push(samples_filtered); 65 } else { 66 // Feed buffer with `samples`. 67 pitch_buf_24kHz_.Push(samples); 68 } 69 // Extract the LP residual. 70 float lpc_coeffs[kNumLpcCoefficients]; 71 ComputeAndPostProcessLpcCoefficients(pitch_buf_24kHz_view_, lpc_coeffs); 72 ComputeLpResidual(lpc_coeffs, pitch_buf_24kHz_view_, lp_residual_view_); 73 // Estimate pitch on the LP-residual and write the normalized pitch period 74 // into the output vector (normalization based on training data stats). 75 pitch_period_48kHz_ = pitch_estimator_.Estimate(lp_residual_view_); 76 feature_vector[kFeatureVectorSize - 2] = 0.01f * (pitch_period_48kHz_ - 300); 77 // Extract lagged frames (according to the estimated pitch period). 78 RTC_DCHECK_LE(pitch_period_48kHz_ / 2, kMaxPitch24kHz); 79 auto lagged_frame = pitch_buf_24kHz_view_.subview( 80 kMaxPitch24kHz - pitch_period_48kHz_ / 2, kFrameSize20ms24kHz); 81 // Analyze reference and lagged frames checking if silence has been detected 82 // and write the feature vector. 83 return spectral_features_extractor_.CheckSilenceComputeFeatures( 84 reference_frame_view_, {lagged_frame.data(), kFrameSize20ms24kHz}, 85 {feature_vector.data() + kNumLowerBands, kNumBands - kNumLowerBands}, 86 {feature_vector.data(), kNumLowerBands}, 87 {feature_vector.data() + kNumBands, kNumLowerBands}, 88 {feature_vector.data() + kNumBands + kNumLowerBands, kNumLowerBands}, 89 {feature_vector.data() + kNumBands + 2 * kNumLowerBands, kNumLowerBands}, 90 &feature_vector[kFeatureVectorSize - 1]); 91 } 92 93 } // namespace rnn_vad 94 } // namespace webrtc