residual_echo_estimator.cc (16499B)
1 /* 2 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "modules/audio_processing/aec3/residual_echo_estimator.h" 12 13 #include <algorithm> 14 #include <array> 15 #include <cstddef> 16 #include <vector> 17 18 #include "api/array_view.h" 19 #include "api/audio/echo_canceller3_config.h" 20 #include "api/environment/environment.h" 21 #include "api/field_trials_view.h" 22 #include "modules/audio_processing/aec3/aec3_common.h" 23 #include "modules/audio_processing/aec3/aec_state.h" 24 #include "modules/audio_processing/aec3/render_buffer.h" 25 #include "modules/audio_processing/aec3/reverb_model.h" 26 #include "modules/audio_processing/aec3/spectrum_buffer.h" 27 #include "rtc_base/checks.h" 28 29 namespace webrtc { 30 namespace { 31 32 constexpr float kDefaultTransparentModeGain = 0.01f; 33 34 float GetTransparentModeGain() { 35 return kDefaultTransparentModeGain; 36 } 37 38 float GetEarlyReflectionsDefaultModeGain( 39 const FieldTrialsView& field_trials, 40 const EchoCanceller3Config::EpStrength& config) { 41 if (field_trials.IsEnabled("WebRTC-Aec3UseLowEarlyReflectionsDefaultGain")) { 42 return 0.1f; 43 } 44 return config.default_gain; 45 } 46 47 float GetLateReflectionsDefaultModeGain( 48 const FieldTrialsView& field_trials, 49 const EchoCanceller3Config::EpStrength& config) { 50 if (field_trials.IsEnabled("WebRTC-Aec3UseLowLateReflectionsDefaultGain")) { 51 return 0.1f; 52 } 53 return config.default_gain; 54 } 55 56 bool UseErleOnsetCompensationInDominantNearend( 57 const FieldTrialsView& field_trials, 58 const EchoCanceller3Config::EpStrength& config) { 59 return config.erle_onset_compensation_in_dominant_nearend || 60 field_trials.IsEnabled( 61 "WebRTC-Aec3UseErleOnsetCompensationInDominantNearend"); 62 } 63 64 // Computes the indexes that will be used for computing spectral power over 65 // the blocks surrounding the delay. 66 void GetRenderIndexesToAnalyze( 67 const SpectrumBuffer& spectrum_buffer, 68 const EchoCanceller3Config::EchoModel& echo_model, 69 int filter_delay_blocks, 70 int* idx_start, 71 int* idx_stop) { 72 RTC_DCHECK(idx_start); 73 RTC_DCHECK(idx_stop); 74 size_t window_start; 75 size_t window_end; 76 window_start = 77 std::max(0, filter_delay_blocks - 78 static_cast<int>(echo_model.render_pre_window_size)); 79 window_end = filter_delay_blocks + 80 static_cast<int>(echo_model.render_post_window_size); 81 *idx_start = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_start); 82 *idx_stop = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_end + 1); 83 } 84 85 // Estimates the residual echo power based on the echo return loss enhancement 86 // (ERLE) and the linear power estimate. 87 void LinearEstimate( 88 ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear, 89 ArrayView<const std::array<float, kFftLengthBy2Plus1>> erle, 90 ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) { 91 RTC_DCHECK_EQ(S2_linear.size(), erle.size()); 92 RTC_DCHECK_EQ(S2_linear.size(), R2.size()); 93 94 const size_t num_capture_channels = R2.size(); 95 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 96 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 97 RTC_DCHECK_LT(0.f, erle[ch][k]); 98 R2[ch][k] = S2_linear[ch][k] / erle[ch][k]; 99 } 100 } 101 } 102 103 // Estimates the residual echo power based on the estimate of the echo path 104 // gain. 105 void NonLinearEstimate(float echo_path_gain, 106 const std::array<float, kFftLengthBy2Plus1>& X2, 107 ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) { 108 const size_t num_capture_channels = R2.size(); 109 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 110 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 111 R2[ch][k] = X2[k] * echo_path_gain; 112 } 113 } 114 } 115 116 // Applies a soft noise gate to the echo generating power. 117 void ApplyNoiseGate(const EchoCanceller3Config::EchoModel& config, 118 ArrayView<float, kFftLengthBy2Plus1> X2) { 119 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 120 if (config.noise_gate_power > X2[k]) { 121 X2[k] = std::max(0.f, X2[k] - config.noise_gate_slope * 122 (config.noise_gate_power - X2[k])); 123 } 124 } 125 } 126 127 // Estimates the echo generating signal power as gated maximal power over a 128 // time window. 129 void EchoGeneratingPower(size_t num_render_channels, 130 const SpectrumBuffer& spectrum_buffer, 131 const EchoCanceller3Config::EchoModel& echo_model, 132 int filter_delay_blocks, 133 ArrayView<float, kFftLengthBy2Plus1> X2) { 134 int idx_stop; 135 int idx_start; 136 GetRenderIndexesToAnalyze(spectrum_buffer, echo_model, filter_delay_blocks, 137 &idx_start, &idx_stop); 138 139 std::fill(X2.begin(), X2.end(), 0.f); 140 if (num_render_channels == 1) { 141 for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) { 142 for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) { 143 X2[j] = std::max(X2[j], spectrum_buffer.buffer[k][/*channel=*/0][j]); 144 } 145 } 146 } else { 147 for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) { 148 std::array<float, kFftLengthBy2Plus1> render_power; 149 render_power.fill(0.f); 150 for (size_t ch = 0; ch < num_render_channels; ++ch) { 151 const auto& channel_power = spectrum_buffer.buffer[k][ch]; 152 for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) { 153 render_power[j] += channel_power[j]; 154 } 155 } 156 for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) { 157 X2[j] = std::max(X2[j], render_power[j]); 158 } 159 } 160 } 161 } 162 163 } // namespace 164 165 ResidualEchoEstimator::ResidualEchoEstimator( 166 const Environment& env, 167 const EchoCanceller3Config& config, 168 size_t num_render_channels, 169 NeuralResidualEchoEstimator* neural_residual_echo_estimator) 170 : config_(config), 171 num_render_channels_(num_render_channels), 172 early_reflections_transparent_mode_gain_(GetTransparentModeGain()), 173 late_reflections_transparent_mode_gain_(GetTransparentModeGain()), 174 early_reflections_general_gain_( 175 GetEarlyReflectionsDefaultModeGain(env.field_trials(), 176 config_.ep_strength)), 177 late_reflections_general_gain_( 178 GetLateReflectionsDefaultModeGain(env.field_trials(), 179 config_.ep_strength)), 180 erle_onset_compensation_in_dominant_nearend_( 181 UseErleOnsetCompensationInDominantNearend(env.field_trials(), 182 config_.ep_strength)), 183 neural_residual_echo_estimator_(neural_residual_echo_estimator) { 184 Reset(); 185 } 186 187 ResidualEchoEstimator::~ResidualEchoEstimator() = default; 188 189 void ResidualEchoEstimator::Estimate( 190 const AecState& aec_state, 191 const RenderBuffer& render_buffer, 192 ArrayView<const std::array<float, kFftLengthBy2>> capture, 193 ArrayView<const std::array<float, kFftLengthBy2>> linear_aec_output, 194 ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear, 195 ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2, 196 ArrayView<const std::array<float, kFftLengthBy2Plus1>> E2, 197 bool dominant_nearend, 198 ArrayView<std::array<float, kFftLengthBy2Plus1>> R2, 199 ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) { 200 RTC_DCHECK_EQ(R2.size(), Y2.size()); 201 RTC_DCHECK_EQ(R2.size(), S2_linear.size()); 202 203 const size_t num_capture_channels = R2.size(); 204 205 // Estimate the power of the stationary noise in the render signal. 206 UpdateRenderNoisePower(render_buffer); 207 208 // The neural residual echo estimation always runs, even if the estimated 209 // spectra |R2| and |R2_unbounded| are overwritten later. This ensures the 210 // estimator sees continuous signals at a constant time rate. 211 if (neural_residual_echo_estimator_ != nullptr) { 212 constexpr int kNeuralDelayHeadroomMs = 12; 213 constexpr int kNeuralDelayHeadroomBlocks = 214 kNeuralDelayHeadroomMs / kBlockSizeMs; 215 constexpr int kJitterMarginBlocks = 3; 216 std::optional<DelayEstimate> external_delay_blocks = 217 aec_state.ExternalDelayBlocks(); 218 int headroom_blocks = 0; 219 int headroom_render_buffer = render_buffer.Headroom(); 220 if (external_delay_blocks && 221 external_delay_blocks->delay > 222 kNeuralDelayHeadroomBlocks + kJitterMarginBlocks && 223 headroom_render_buffer > 0) { 224 headroom_blocks = 225 std::min(headroom_render_buffer - 1, kNeuralDelayHeadroomBlocks); 226 } 227 ArrayView<const float> render = 228 render_buffer.GetBlock(headroom_blocks).View(/*band=*/0, /*ch=*/0); 229 neural_residual_echo_estimator_->Estimate(render, capture, 230 linear_aec_output, S2_linear, Y2, 231 E2, R2, R2_unbounded); 232 } 233 234 // Estimate the residual echo power. 235 if (aec_state.UsableLinearEstimate()) { 236 if (neural_residual_echo_estimator_ == nullptr) { 237 // When there is saturated echo, assume the same spectral content as is 238 // present in the microphone signal. 239 if (aec_state.SaturatedEcho()) { 240 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 241 std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin()); 242 std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin()); 243 } 244 } else { 245 const bool onset_compensated = 246 erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend; 247 LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2); 248 LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded); 249 } 250 251 UpdateReverb(ReverbType::kLinear, aec_state, render_buffer, 252 dominant_nearend); 253 AddReverb(R2); 254 AddReverb(R2_unbounded); 255 } 256 } else { 257 const float echo_path_gain = 258 GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true); 259 260 // When there is saturated echo, assume the same spectral content as is 261 // present in the microphone signal. 262 if (aec_state.SaturatedEcho()) { 263 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 264 std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin()); 265 std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin()); 266 } 267 } else { 268 // Estimate the echo generating signal power. 269 std::array<float, kFftLengthBy2Plus1> X2; 270 EchoGeneratingPower(num_render_channels_, 271 render_buffer.GetSpectrumBuffer(), config_.echo_model, 272 aec_state.MinDirectPathFilterDelay(), X2); 273 if (!aec_state.UseStationarityProperties()) { 274 ApplyNoiseGate(config_.echo_model, X2); 275 } 276 277 // Subtract the stationary noise power to avoid stationary noise causing 278 // excessive echo suppression. 279 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 280 X2[k] -= config_.echo_model.stationary_gate_slope * X2_noise_floor_[k]; 281 X2[k] = std::max(0.f, X2[k]); 282 } 283 284 NonLinearEstimate(echo_path_gain, X2, R2); 285 NonLinearEstimate(echo_path_gain, X2, R2_unbounded); 286 } 287 288 if (config_.echo_model.model_reverb_in_nonlinear_mode && 289 !aec_state.TransparentModeActive()) { 290 UpdateReverb(ReverbType::kNonLinear, aec_state, render_buffer, 291 dominant_nearend); 292 AddReverb(R2); 293 AddReverb(R2_unbounded); 294 } 295 } 296 297 if (aec_state.UseStationarityProperties()) { 298 // Scale the echo according to echo audibility. 299 std::array<float, kFftLengthBy2Plus1> residual_scaling; 300 aec_state.GetResidualEchoScaling(residual_scaling); 301 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 302 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 303 R2[ch][k] *= residual_scaling[k]; 304 R2_unbounded[ch][k] *= residual_scaling[k]; 305 } 306 } 307 } 308 } 309 310 void ResidualEchoEstimator::Reset() { 311 echo_reverb_.Reset(); 312 X2_noise_floor_counter_.fill(config_.echo_model.noise_floor_hold); 313 X2_noise_floor_.fill(config_.echo_model.min_noise_floor_power); 314 } 315 316 void ResidualEchoEstimator::UpdateRenderNoisePower( 317 const RenderBuffer& render_buffer) { 318 std::array<float, kFftLengthBy2Plus1> render_power_data; 319 ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 = 320 render_buffer.Spectrum(0); 321 ArrayView<const float, kFftLengthBy2Plus1> render_power = X2[/*channel=*/0]; 322 if (num_render_channels_ > 1) { 323 render_power_data.fill(0.f); 324 for (size_t ch = 0; ch < num_render_channels_; ++ch) { 325 const auto& channel_power = X2[ch]; 326 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 327 render_power_data[k] += channel_power[k]; 328 } 329 } 330 render_power = render_power_data; 331 } 332 333 // Estimate the stationary noise power in a minimum statistics manner. 334 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 335 // Decrease rapidly. 336 if (render_power[k] < X2_noise_floor_[k]) { 337 X2_noise_floor_[k] = render_power[k]; 338 X2_noise_floor_counter_[k] = 0; 339 } else { 340 // Increase in a delayed, leaky manner. 341 if (X2_noise_floor_counter_[k] >= 342 static_cast<int>(config_.echo_model.noise_floor_hold)) { 343 X2_noise_floor_[k] = std::max(X2_noise_floor_[k] * 1.1f, 344 config_.echo_model.min_noise_floor_power); 345 } else { 346 ++X2_noise_floor_counter_[k]; 347 } 348 } 349 } 350 } 351 352 // Updates the reverb estimation. 353 void ResidualEchoEstimator::UpdateReverb(ReverbType reverb_type, 354 const AecState& aec_state, 355 const RenderBuffer& render_buffer, 356 bool dominant_nearend) { 357 // Choose reverb partition based on what type of echo power model is used. 358 const size_t first_reverb_partition = 359 reverb_type == ReverbType::kLinear 360 ? aec_state.FilterLengthBlocks() + 1 361 : aec_state.MinDirectPathFilterDelay() + 1; 362 363 // Compute render power for the reverb. 364 std::array<float, kFftLengthBy2Plus1> render_power_data; 365 ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 = 366 render_buffer.Spectrum(first_reverb_partition); 367 ArrayView<const float, kFftLengthBy2Plus1> render_power = X2[/*channel=*/0]; 368 if (num_render_channels_ > 1) { 369 render_power_data.fill(0.f); 370 for (size_t ch = 0; ch < num_render_channels_; ++ch) { 371 const auto& channel_power = X2[ch]; 372 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 373 render_power_data[k] += channel_power[k]; 374 } 375 } 376 render_power = render_power_data; 377 } 378 379 // Update the reverb estimate. 380 float reverb_decay = aec_state.ReverbDecay(/*mild=*/dominant_nearend); 381 if (reverb_type == ReverbType::kLinear) { 382 echo_reverb_.UpdateReverb( 383 render_power, aec_state.GetReverbFrequencyResponse(), reverb_decay); 384 } else { 385 const float echo_path_gain = 386 GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/false); 387 echo_reverb_.UpdateReverbNoFreqShaping(render_power, echo_path_gain, 388 reverb_decay); 389 } 390 } 391 // Adds the estimated power of the reverb to the residual echo power. 392 void ResidualEchoEstimator::AddReverb( 393 ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) const { 394 const size_t num_capture_channels = R2.size(); 395 396 // Add the reverb power. 397 ArrayView<const float, kFftLengthBy2Plus1> reverb_power = 398 echo_reverb_.reverb(); 399 for (size_t ch = 0; ch < num_capture_channels; ++ch) { 400 for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) { 401 R2[ch][k] += reverb_power[k]; 402 } 403 } 404 } 405 406 // Chooses the echo path gain to use. 407 float ResidualEchoEstimator::GetEchoPathGain( 408 const AecState& aec_state, 409 bool gain_for_early_reflections) const { 410 float gain_amplitude; 411 if (aec_state.TransparentModeActive()) { 412 gain_amplitude = gain_for_early_reflections 413 ? early_reflections_transparent_mode_gain_ 414 : late_reflections_transparent_mode_gain_; 415 } else { 416 gain_amplitude = gain_for_early_reflections 417 ? early_reflections_general_gain_ 418 : late_reflections_general_gain_; 419 } 420 return gain_amplitude * gain_amplitude; 421 } 422 423 } // namespace webrtc