tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

residual_echo_estimator.cc (16499B)


      1 /*
      2 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include "modules/audio_processing/aec3/residual_echo_estimator.h"
     12 
#include <algorithm>
#include <array>
#include <cstddef>
#include <optional>
#include <vector>
     17 
     18 #include "api/array_view.h"
     19 #include "api/audio/echo_canceller3_config.h"
     20 #include "api/environment/environment.h"
     21 #include "api/field_trials_view.h"
     22 #include "modules/audio_processing/aec3/aec3_common.h"
     23 #include "modules/audio_processing/aec3/aec_state.h"
     24 #include "modules/audio_processing/aec3/render_buffer.h"
     25 #include "modules/audio_processing/aec3/reverb_model.h"
     26 #include "modules/audio_processing/aec3/spectrum_buffer.h"
     27 #include "rtc_base/checks.h"
     28 
     29 namespace webrtc {
     30 namespace {
     31 
     32 constexpr float kDefaultTransparentModeGain = 0.01f;
     33 
     34 float GetTransparentModeGain() {
     35  return kDefaultTransparentModeGain;
     36 }
     37 
     38 float GetEarlyReflectionsDefaultModeGain(
     39    const FieldTrialsView& field_trials,
     40    const EchoCanceller3Config::EpStrength& config) {
     41  if (field_trials.IsEnabled("WebRTC-Aec3UseLowEarlyReflectionsDefaultGain")) {
     42    return 0.1f;
     43  }
     44  return config.default_gain;
     45 }
     46 
     47 float GetLateReflectionsDefaultModeGain(
     48    const FieldTrialsView& field_trials,
     49    const EchoCanceller3Config::EpStrength& config) {
     50  if (field_trials.IsEnabled("WebRTC-Aec3UseLowLateReflectionsDefaultGain")) {
     51    return 0.1f;
     52  }
     53  return config.default_gain;
     54 }
     55 
     56 bool UseErleOnsetCompensationInDominantNearend(
     57    const FieldTrialsView& field_trials,
     58    const EchoCanceller3Config::EpStrength& config) {
     59  return config.erle_onset_compensation_in_dominant_nearend ||
     60         field_trials.IsEnabled(
     61             "WebRTC-Aec3UseErleOnsetCompensationInDominantNearend");
     62 }
     63 
     64 // Computes the indexes that will be used for computing spectral power over
     65 // the blocks surrounding the delay.
     66 void GetRenderIndexesToAnalyze(
     67    const SpectrumBuffer& spectrum_buffer,
     68    const EchoCanceller3Config::EchoModel& echo_model,
     69    int filter_delay_blocks,
     70    int* idx_start,
     71    int* idx_stop) {
     72  RTC_DCHECK(idx_start);
     73  RTC_DCHECK(idx_stop);
     74  size_t window_start;
     75  size_t window_end;
     76  window_start =
     77      std::max(0, filter_delay_blocks -
     78                      static_cast<int>(echo_model.render_pre_window_size));
     79  window_end = filter_delay_blocks +
     80               static_cast<int>(echo_model.render_post_window_size);
     81  *idx_start = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_start);
     82  *idx_stop = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_end + 1);
     83 }
     84 
     85 // Estimates the residual echo power based on the echo return loss enhancement
     86 // (ERLE) and the linear power estimate.
     87 void LinearEstimate(
     88    ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
     89    ArrayView<const std::array<float, kFftLengthBy2Plus1>> erle,
     90    ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
     91  RTC_DCHECK_EQ(S2_linear.size(), erle.size());
     92  RTC_DCHECK_EQ(S2_linear.size(), R2.size());
     93 
     94  const size_t num_capture_channels = R2.size();
     95  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
     96    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
     97      RTC_DCHECK_LT(0.f, erle[ch][k]);
     98      R2[ch][k] = S2_linear[ch][k] / erle[ch][k];
     99    }
    100  }
    101 }
    102 
    103 // Estimates the residual echo power based on the estimate of the echo path
    104 // gain.
    105 void NonLinearEstimate(float echo_path_gain,
    106                       const std::array<float, kFftLengthBy2Plus1>& X2,
    107                       ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
    108  const size_t num_capture_channels = R2.size();
    109  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    110    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    111      R2[ch][k] = X2[k] * echo_path_gain;
    112    }
    113  }
    114 }
    115 
    116 // Applies a soft noise gate to the echo generating power.
    117 void ApplyNoiseGate(const EchoCanceller3Config::EchoModel& config,
    118                    ArrayView<float, kFftLengthBy2Plus1> X2) {
    119  for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    120    if (config.noise_gate_power > X2[k]) {
    121      X2[k] = std::max(0.f, X2[k] - config.noise_gate_slope *
    122                                        (config.noise_gate_power - X2[k]));
    123    }
    124  }
    125 }
    126 
    127 // Estimates the echo generating signal power as gated maximal power over a
    128 // time window.
    129 void EchoGeneratingPower(size_t num_render_channels,
    130                         const SpectrumBuffer& spectrum_buffer,
    131                         const EchoCanceller3Config::EchoModel& echo_model,
    132                         int filter_delay_blocks,
    133                         ArrayView<float, kFftLengthBy2Plus1> X2) {
    134  int idx_stop;
    135  int idx_start;
    136  GetRenderIndexesToAnalyze(spectrum_buffer, echo_model, filter_delay_blocks,
    137                            &idx_start, &idx_stop);
    138 
    139  std::fill(X2.begin(), X2.end(), 0.f);
    140  if (num_render_channels == 1) {
    141    for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
    142      for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
    143        X2[j] = std::max(X2[j], spectrum_buffer.buffer[k][/*channel=*/0][j]);
    144      }
    145    }
    146  } else {
    147    for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
    148      std::array<float, kFftLengthBy2Plus1> render_power;
    149      render_power.fill(0.f);
    150      for (size_t ch = 0; ch < num_render_channels; ++ch) {
    151        const auto& channel_power = spectrum_buffer.buffer[k][ch];
    152        for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
    153          render_power[j] += channel_power[j];
    154        }
    155      }
    156      for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
    157        X2[j] = std::max(X2[j], render_power[j]);
    158      }
    159    }
    160  }
    161 }
    162 
    163 }  // namespace
    164 
    // Constructs the estimator.
    //  - `env` supplies the field trials that may override the default-mode
    //    reflection gains and the ERLE onset compensation setting.
    //  - `config` is stored in `config_`; the later initializers read
    //    `config_.ep_strength`, so they rely on `config_` being initialized
    //    before them.
    //  - `num_render_channels` is the number of render channels whose power is
    //    summed when estimating render power.
    //  - `neural_residual_echo_estimator` may be null; Estimate() checks for
    //    null before using it.
    165 ResidualEchoEstimator::ResidualEchoEstimator(
    166    const Environment& env,
    167    const EchoCanceller3Config& config,
    168    size_t num_render_channels,
    169    NeuralResidualEchoEstimator* neural_residual_echo_estimator)
    170    : config_(config),
    171      num_render_channels_(num_render_channels),
    172      early_reflections_transparent_mode_gain_(GetTransparentModeGain()),
    173      late_reflections_transparent_mode_gain_(GetTransparentModeGain()),
    174      early_reflections_general_gain_(
    175          GetEarlyReflectionsDefaultModeGain(env.field_trials(),
    176                                             config_.ep_strength)),
    177      late_reflections_general_gain_(
    178          GetLateReflectionsDefaultModeGain(env.field_trials(),
    179                                            config_.ep_strength)),
    180      erle_onset_compensation_in_dominant_nearend_(
    181          UseErleOnsetCompensationInDominantNearend(env.field_trials(),
    182                                                    config_.ep_strength)),
    183      neural_residual_echo_estimator_(neural_residual_echo_estimator) {
         // Start from a reset reverb model and initial noise floor state.
    184  Reset();
    185 }
    186 
    // Defaulted destructor; no custom teardown is performed here.
    187 ResidualEchoEstimator::~ResidualEchoEstimator() = default;
    188 
    189 void ResidualEchoEstimator::Estimate(
    190    const AecState& aec_state,
    191    const RenderBuffer& render_buffer,
    192    ArrayView<const std::array<float, kFftLengthBy2>> capture,
    193    ArrayView<const std::array<float, kFftLengthBy2>> linear_aec_output,
    194    ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
    195    ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2,
    196    ArrayView<const std::array<float, kFftLengthBy2Plus1>> E2,
    197    bool dominant_nearend,
    198    ArrayView<std::array<float, kFftLengthBy2Plus1>> R2,
    199    ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) {
    200  RTC_DCHECK_EQ(R2.size(), Y2.size());
    201  RTC_DCHECK_EQ(R2.size(), S2_linear.size());
    202 
    203  const size_t num_capture_channels = R2.size();
    204 
    205  // Estimate the power of the stationary noise in the render signal.
    206  UpdateRenderNoisePower(render_buffer);
    207 
    208  // The neural residual echo estimation always runs, even if the estimated
    209  // spectra |R2| and |R2_unbounded| are overwritten later. This ensures the
    210  // estimator sees continuous signals at a constant time rate.
    211  if (neural_residual_echo_estimator_ != nullptr) {
    212    constexpr int kNeuralDelayHeadroomMs = 12;
    213    constexpr int kNeuralDelayHeadroomBlocks =
    214        kNeuralDelayHeadroomMs / kBlockSizeMs;
    215    constexpr int kJitterMarginBlocks = 3;
    216    std::optional<DelayEstimate> external_delay_blocks =
    217        aec_state.ExternalDelayBlocks();
    218    int headroom_blocks = 0;
    219    int headroom_render_buffer = render_buffer.Headroom();
    220    if (external_delay_blocks &&
    221        external_delay_blocks->delay >
    222            kNeuralDelayHeadroomBlocks + kJitterMarginBlocks &&
    223        headroom_render_buffer > 0) {
    224      headroom_blocks =
    225          std::min(headroom_render_buffer - 1, kNeuralDelayHeadroomBlocks);
    226    }
    227    ArrayView<const float> render =
    228        render_buffer.GetBlock(headroom_blocks).View(/*band=*/0, /*ch=*/0);
    229    neural_residual_echo_estimator_->Estimate(render, capture,
    230                                              linear_aec_output, S2_linear, Y2,
    231                                              E2, R2, R2_unbounded);
    232  }
    233 
    234  // Estimate the residual echo power.
    235  if (aec_state.UsableLinearEstimate()) {
    236    if (neural_residual_echo_estimator_ == nullptr) {
    237      // When there is saturated echo, assume the same spectral content as is
    238      // present in the microphone signal.
    239      if (aec_state.SaturatedEcho()) {
    240        for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    241          std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
    242          std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
    243        }
    244      } else {
    245        const bool onset_compensated =
    246            erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend;
    247        LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2);
    248        LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded);
    249      }
    250 
    251      UpdateReverb(ReverbType::kLinear, aec_state, render_buffer,
    252                   dominant_nearend);
    253      AddReverb(R2);
    254      AddReverb(R2_unbounded);
    255    }
    256  } else {
    257    const float echo_path_gain =
    258        GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true);
    259 
    260    // When there is saturated echo, assume the same spectral content as is
    261    // present in the microphone signal.
    262    if (aec_state.SaturatedEcho()) {
    263      for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    264        std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
    265        std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
    266      }
    267    } else {
    268      // Estimate the echo generating signal power.
    269      std::array<float, kFftLengthBy2Plus1> X2;
    270      EchoGeneratingPower(num_render_channels_,
    271                          render_buffer.GetSpectrumBuffer(), config_.echo_model,
    272                          aec_state.MinDirectPathFilterDelay(), X2);
    273      if (!aec_state.UseStationarityProperties()) {
    274        ApplyNoiseGate(config_.echo_model, X2);
    275      }
    276 
    277      // Subtract the stationary noise power to avoid stationary noise causing
    278      // excessive echo suppression.
    279      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    280        X2[k] -= config_.echo_model.stationary_gate_slope * X2_noise_floor_[k];
    281        X2[k] = std::max(0.f, X2[k]);
    282      }
    283 
    284      NonLinearEstimate(echo_path_gain, X2, R2);
    285      NonLinearEstimate(echo_path_gain, X2, R2_unbounded);
    286    }
    287 
    288    if (config_.echo_model.model_reverb_in_nonlinear_mode &&
    289        !aec_state.TransparentModeActive()) {
    290      UpdateReverb(ReverbType::kNonLinear, aec_state, render_buffer,
    291                   dominant_nearend);
    292      AddReverb(R2);
    293      AddReverb(R2_unbounded);
    294    }
    295  }
    296 
    297  if (aec_state.UseStationarityProperties()) {
    298    // Scale the echo according to echo audibility.
    299    std::array<float, kFftLengthBy2Plus1> residual_scaling;
    300    aec_state.GetResidualEchoScaling(residual_scaling);
    301    for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    302      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    303        R2[ch][k] *= residual_scaling[k];
    304        R2_unbounded[ch][k] *= residual_scaling[k];
    305      }
    306    }
    307  }
    308 }
    309 
    310 void ResidualEchoEstimator::Reset() {
    311  echo_reverb_.Reset();
    312  X2_noise_floor_counter_.fill(config_.echo_model.noise_floor_hold);
    313  X2_noise_floor_.fill(config_.echo_model.min_noise_floor_power);
    314 }
    315 
    316 void ResidualEchoEstimator::UpdateRenderNoisePower(
    317    const RenderBuffer& render_buffer) {
    318  std::array<float, kFftLengthBy2Plus1> render_power_data;
    319  ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
    320      render_buffer.Spectrum(0);
    321  ArrayView<const float, kFftLengthBy2Plus1> render_power = X2[/*channel=*/0];
    322  if (num_render_channels_ > 1) {
    323    render_power_data.fill(0.f);
    324    for (size_t ch = 0; ch < num_render_channels_; ++ch) {
    325      const auto& channel_power = X2[ch];
    326      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    327        render_power_data[k] += channel_power[k];
    328      }
    329    }
    330    render_power = render_power_data;
    331  }
    332 
    333  // Estimate the stationary noise power in a minimum statistics manner.
    334  for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    335    // Decrease rapidly.
    336    if (render_power[k] < X2_noise_floor_[k]) {
    337      X2_noise_floor_[k] = render_power[k];
    338      X2_noise_floor_counter_[k] = 0;
    339    } else {
    340      // Increase in a delayed, leaky manner.
    341      if (X2_noise_floor_counter_[k] >=
    342          static_cast<int>(config_.echo_model.noise_floor_hold)) {
    343        X2_noise_floor_[k] = std::max(X2_noise_floor_[k] * 1.1f,
    344                                      config_.echo_model.min_noise_floor_power);
    345      } else {
    346        ++X2_noise_floor_counter_[k];
    347      }
    348    }
    349  }
    350 }
    351 
    352 // Updates the reverb estimation.
    353 void ResidualEchoEstimator::UpdateReverb(ReverbType reverb_type,
    354                                         const AecState& aec_state,
    355                                         const RenderBuffer& render_buffer,
    356                                         bool dominant_nearend) {
    357  // Choose reverb partition based on what type of echo power model is used.
    358  const size_t first_reverb_partition =
    359      reverb_type == ReverbType::kLinear
    360          ? aec_state.FilterLengthBlocks() + 1
    361          : aec_state.MinDirectPathFilterDelay() + 1;
    362 
    363  // Compute render power for the reverb.
    364  std::array<float, kFftLengthBy2Plus1> render_power_data;
    365  ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
    366      render_buffer.Spectrum(first_reverb_partition);
    367  ArrayView<const float, kFftLengthBy2Plus1> render_power = X2[/*channel=*/0];
    368  if (num_render_channels_ > 1) {
    369    render_power_data.fill(0.f);
    370    for (size_t ch = 0; ch < num_render_channels_; ++ch) {
    371      const auto& channel_power = X2[ch];
    372      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    373        render_power_data[k] += channel_power[k];
    374      }
    375    }
    376    render_power = render_power_data;
    377  }
    378 
    379  // Update the reverb estimate.
    380  float reverb_decay = aec_state.ReverbDecay(/*mild=*/dominant_nearend);
    381  if (reverb_type == ReverbType::kLinear) {
    382    echo_reverb_.UpdateReverb(
    383        render_power, aec_state.GetReverbFrequencyResponse(), reverb_decay);
    384  } else {
    385    const float echo_path_gain =
    386        GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/false);
    387    echo_reverb_.UpdateReverbNoFreqShaping(render_power, echo_path_gain,
    388                                           reverb_decay);
    389  }
    390 }
    391 // Adds the estimated power of the reverb to the residual echo power.
    392 void ResidualEchoEstimator::AddReverb(
    393    ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) const {
    394  const size_t num_capture_channels = R2.size();
    395 
    396  // Add the reverb power.
    397  ArrayView<const float, kFftLengthBy2Plus1> reverb_power =
    398      echo_reverb_.reverb();
    399  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    400    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    401      R2[ch][k] += reverb_power[k];
    402    }
    403  }
    404 }
    405 
    406 // Chooses the echo path gain to use.
    407 float ResidualEchoEstimator::GetEchoPathGain(
    408    const AecState& aec_state,
    409    bool gain_for_early_reflections) const {
    410  float gain_amplitude;
    411  if (aec_state.TransparentModeActive()) {
    412    gain_amplitude = gain_for_early_reflections
    413                         ? early_reflections_transparent_mode_gain_
    414                         : late_reflections_transparent_mode_gain_;
    415  } else {
    416    gain_amplitude = gain_for_early_reflections
    417                         ? early_reflections_general_gain_
    418                         : late_reflections_general_gain_;
    419  }
    420  return gain_amplitude * gain_amplitude;
    421 }
    422 
    423 }  // namespace webrtc