speech_level_estimator.cc (7074B)
1 /* 2 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/agc2/speech_level_estimator.h" 12 13 #include "api/audio/audio_processing.h" 14 #include "modules/audio_processing/agc2/agc2_common.h" 15 #include "modules/audio_processing/logging/apm_data_dumper.h" 16 #include "rtc_base/checks.h" 17 #include "rtc_base/numerics/safe_minmax.h" 18 19 namespace webrtc { 20 namespace { 21 22 float ClampLevelEstimateDbfs(float level_estimate_dbfs) { 23 return SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f); 24 } 25 26 // Returns the initial speech level estimate needed to apply the initial gain. 27 float GetInitialSpeechLevelEstimateDbfs( 28 const AudioProcessing::Config::GainController2::AdaptiveDigital& config) { 29 return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb - 30 config.initial_gain_db - config.headroom_db); 31 } 32 33 } // namespace 34 35 bool SpeechLevelEstimator::LevelEstimatorState::operator==( 36 const SpeechLevelEstimator::LevelEstimatorState& b) const { 37 return time_to_confidence_ms == b.time_to_confidence_ms && 38 level_dbfs.numerator == b.level_dbfs.numerator && 39 level_dbfs.denominator == b.level_dbfs.denominator; 40 } 41 42 float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const { 43 RTC_DCHECK_NE(denominator, 0.f); 44 return numerator / denominator; 45 } 46 47 SpeechLevelEstimator::SpeechLevelEstimator( 48 ApmDataDumper* apm_data_dumper, 49 const AudioProcessing::Config::GainController2::AdaptiveDigital& config, 50 int adjacent_speech_frames_threshold) 51 : apm_data_dumper_(apm_data_dumper), 52 initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)), 53 adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold), 54 level_dbfs_(initial_speech_level_dbfs_), 55 // TODO(bugs.webrtc.org/7494): Remove init below when AGC2 input volume 56 // controller temporal dependency removed. 57 is_confident_(false) { 58 RTC_DCHECK(apm_data_dumper_); 59 RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1); 60 Reset(); 61 } 62 63 void SpeechLevelEstimator::Update(float rms_dbfs, 64 float peak_dbfs, 65 float speech_probability) { 66 RTC_DCHECK_GT(rms_dbfs, -150.0f); 67 RTC_DCHECK_LT(rms_dbfs, 50.0f); 68 RTC_DCHECK_GT(peak_dbfs, -150.0f); 69 RTC_DCHECK_LT(peak_dbfs, 50.0f); 70 RTC_DCHECK_GE(speech_probability, 0.0f); 71 RTC_DCHECK_LE(speech_probability, 1.0f); 72 if (speech_probability < kVadConfidenceThreshold) { 73 // Not a speech frame. 74 if (adjacent_speech_frames_threshold_ > 1) { 75 // When two or more adjacent speech frames are required in order to update 76 // the state, we need to decide whether to discard or confirm the updates 77 // based on the speech sequence length. 78 if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { 79 // First non-speech frame after a long enough sequence of speech frames. 80 // Update the reliable state. 81 reliable_state_ = preliminary_state_; 82 } else if (num_adjacent_speech_frames_ > 0) { 83 // First non-speech frame after a too short sequence of speech frames. 84 // Reset to the last reliable state. 85 preliminary_state_ = reliable_state_; 86 } 87 } 88 num_adjacent_speech_frames_ = 0; 89 } else { 90 // Speech frame observed. 91 num_adjacent_speech_frames_++; 92 93 // Update preliminary level estimate. 94 RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0); 95 const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0; 96 if (!buffer_is_full) { 97 preliminary_state_.time_to_confidence_ms -= kFrameDurationMs; 98 } 99 // Weighted average of levels with speech probability as weight. 100 RTC_DCHECK_GT(speech_probability, 0.0f); 101 const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f; 102 preliminary_state_.level_dbfs.numerator = 103 preliminary_state_.level_dbfs.numerator * leak_factor + 104 rms_dbfs * speech_probability; 105 preliminary_state_.level_dbfs.denominator = 106 preliminary_state_.level_dbfs.denominator * leak_factor + 107 speech_probability; 108 109 const float level_dbfs = preliminary_state_.level_dbfs.GetRatio(); 110 111 if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) { 112 // `preliminary_state_` is now reliable. Update the last level estimation. 113 level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs); 114 } 115 } 116 UpdateIsConfident(); 117 DumpDebugData(); 118 } 119 120 void SpeechLevelEstimator::UpdateIsConfident() { 121 if (adjacent_speech_frames_threshold_ == 1) { 122 // Ignore `reliable_state_` when a single frame is enough to update the 123 // level estimate (because it is not used). 124 is_confident_ = preliminary_state_.time_to_confidence_ms == 0; 125 return; 126 } 127 // Once confident, it remains confident. 128 RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 || 129 preliminary_state_.time_to_confidence_ms == 0); 130 // During the first long enough speech sequence, `reliable_state_` must be 131 // ignored since `preliminary_state_` is used. 132 is_confident_ = 133 reliable_state_.time_to_confidence_ms == 0 || 134 (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ && 135 preliminary_state_.time_to_confidence_ms == 0); 136 } 137 138 void SpeechLevelEstimator::Reset() { 139 ResetLevelEstimatorState(preliminary_state_); 140 ResetLevelEstimatorState(reliable_state_); 141 level_dbfs_ = initial_speech_level_dbfs_; 142 num_adjacent_speech_frames_ = 0; 143 } 144 145 void SpeechLevelEstimator::ResetLevelEstimatorState( 146 LevelEstimatorState& state) const { 147 state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs; 148 state.level_dbfs.numerator = initial_speech_level_dbfs_; 149 state.level_dbfs.denominator = 1.0f; 150 } 151 152 void SpeechLevelEstimator::DumpDebugData() const { 153 if (!apm_data_dumper_) 154 return; 155 apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_); 156 apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_); 157 apm_data_dumper_->DumpRaw( 158 "agc2_adaptive_level_estimator_num_adjacent_speech_frames", 159 num_adjacent_speech_frames_); 160 apm_data_dumper_->DumpRaw( 161 "agc2_adaptive_level_estimator_preliminary_level_estimate_num", 162 preliminary_state_.level_dbfs.numerator); 163 apm_data_dumper_->DumpRaw( 164 "agc2_adaptive_level_estimator_preliminary_level_estimate_den", 165 preliminary_state_.level_dbfs.denominator); 166 apm_data_dumper_->DumpRaw( 167 "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms", 168 preliminary_state_.time_to_confidence_ms); 169 apm_data_dumper_->DumpRaw( 170 "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms", 171 reliable_state_.time_to_confidence_ms); 172 } 173 174 } // namespace webrtc