tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit f71a85ec1711851e01b6833e8f81c43fad292ff1
parent b1e92b670aff330b0875382d061420fb1fb60d12
Author: Dan Baker <dbaker@mozilla.com>
Date:   Mon,  1 Dec 2025 18:09:30 -0700

Bug 2000941 - Vendor libwebrtc from c223e2036e

Upstream commit: https://webrtc.googlesource.com/src/+/c223e2036ebeb2d0f770ccefc9cfb7efce292a47
    AEC3: Add interface + plumbing for neural residual echo estimation

    This adds support for experimenting with more sophisticated residual
    echo estimation.

    Tested: `audioproc_f --aec=1` is bitexact on a set of 50 aecdumps.
    Credit for original version: devicentepena@webrtc.org

    Bug: webrtc:442444736, b:428638881
    Change-Id: I3df0097ca8673d7f73ab1c68e83b170ecc01de3f
    Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/407446
    Reviewed-by: Sam Zackrisson <saza@webrtc.org>
    Commit-Queue: Sam Zackrisson <saza@webrtc.org>
    Reviewed-by: Per Åhgren <peah@webrtc.org>
    Cr-Commit-Position: refs/heads/main@{#45599}

Diffstat:
Mthird_party/libwebrtc/README.mozilla.last-vendor | 4++--
Mthird_party/libwebrtc/api/audio/BUILD.gn | 7+++++++
Mthird_party/libwebrtc/api/audio/builtin_audio_processing_builder.cc | 3++-
Mthird_party/libwebrtc/api/audio/builtin_audio_processing_builder.h | 11+++++++++++
Mthird_party/libwebrtc/api/audio/echo_canceller3_config.cc | 8++++++++
Mthird_party/libwebrtc/api/audio/echo_canceller3_config.h | 5+++++
Mthird_party/libwebrtc/api/audio/echo_canceller3_factory.cc | 7++++---
Athird_party/libwebrtc/api/audio/neural_residual_echo_estimator.h | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Mthird_party/libwebrtc/modules/audio_processing/BUILD.gn | 1+
Mthird_party/libwebrtc/modules/audio_processing/aec3/BUILD.gn | 2++
Mthird_party/libwebrtc/modules/audio_processing/aec3/aec3_common.h | 1+
Mthird_party/libwebrtc/modules/audio_processing/aec3/aec_state.cc | 1-
Mthird_party/libwebrtc/modules/audio_processing/aec3/aec_state.h | 13+++++++++++--
Mthird_party/libwebrtc/modules/audio_processing/aec3/block_processor.cc | 16++++++++++------
Mthird_party/libwebrtc/modules/audio_processing/aec3/block_processor.h | 7+++++--
Mthird_party/libwebrtc/modules/audio_processing/aec3/block_processor_unittest.cc | 46+++++++++++++++++++++++++---------------------
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.cc | 5++++-
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.h | 3+++
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3_unittest.cc | 100++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_remover.cc | 37+++++++++++++++++++++++++------------
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_remover.h | 13++++++++-----
Mthird_party/libwebrtc/modules/audio_processing/aec3/echo_remover_unittest.cc | 24++++++++++++++----------
Mthird_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.cc | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mthird_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.h | 13++++++++++---
Mthird_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator_unittest.cc | 24+++++++++++++++++++-----
Mthird_party/libwebrtc/modules/audio_processing/aec3/suppression_gain.cc | 28+++++++++++++++++++---------
Mthird_party/libwebrtc/modules/audio_processing/audio_processing_impl.cc | 21+++++++++++++++------
Mthird_party/libwebrtc/modules/audio_processing/audio_processing_impl.h | 14+++++++++++---
Mthird_party/libwebrtc/moz-patch-stack/s0027.patch | 4++--
Mthird_party/libwebrtc/moz-patch-stack/s0034.patch | 6+++---
Mthird_party/libwebrtc/moz-patch-stack/s0102.patch | 2+-
Mthird_party/libwebrtc/test/fuzzers/aec3_fuzzer.cc | 6++++--
32 files changed, 422 insertions(+), 136 deletions(-)

diff --git a/third_party/libwebrtc/README.mozilla.last-vendor b/third_party/libwebrtc/README.mozilla.last-vendor @@ -1,4 +1,4 @@ # ./mach python dom/media/webrtc/third_party_build/vendor-libwebrtc.py --from-local /Users/danielbaker/elm/.moz-fast-forward/moz-libwebrtc --commit mozpatches libwebrtc -libwebrtc updated from /Users/danielbaker/elm/.moz-fast-forward/moz-libwebrtc commit mozpatches on 2025-12-02T01:05:17.702586+00:00. +libwebrtc updated from /Users/danielbaker/elm/.moz-fast-forward/moz-libwebrtc commit mozpatches on 2025-12-02T01:09:12.139655+00:00. # base of lastest vendoring -3bdce3dc17 +c223e2036e diff --git a/third_party/libwebrtc/api/audio/BUILD.gn b/third_party/libwebrtc/api/audio/BUILD.gn @@ -108,6 +108,7 @@ rtc_library("builtin_audio_processing_builder") { deps = [ ":audio_processing", ":echo_control", + ":neural_residual_echo_estimator_api", "..:make_ref_counted", "..:scoped_refptr", "../../modules/audio_processing", @@ -168,6 +169,12 @@ rtc_source_set("echo_control") { ] } +rtc_source_set("neural_residual_echo_estimator_api") { + visibility = [ "*" ] + sources = [ "neural_residual_echo_estimator.h" ] + deps = [ "..:array_view" ] +} + rtc_library("echo_detector_creator") { visibility = [ "*" ] allow_poison = [ "default_echo_detector" ] diff --git a/third_party/libwebrtc/api/audio/builtin_audio_processing_builder.cc b/third_party/libwebrtc/api/audio/builtin_audio_processing_builder.cc @@ -25,7 +25,8 @@ BuiltinAudioProcessingBuilder::Build(const Environment& env) { return make_ref_counted<AudioProcessingImpl>( env, config_, std::move(capture_post_processing_), std::move(render_pre_processing_), std::move(echo_control_factory_), - std::move(echo_detector_), std::move(capture_analyzer_)); + std::move(echo_detector_), std::move(capture_analyzer_), + std::move(neural_residual_echo_estimator_)); } } // namespace webrtc diff --git a/third_party/libwebrtc/api/audio/builtin_audio_processing_builder.h 
b/third_party/libwebrtc/api/audio/builtin_audio_processing_builder.h @@ -17,6 +17,7 @@ #include "absl/base/nullability.h" #include "api/audio/audio_processing.h" #include "api/audio/echo_control.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "api/scoped_refptr.h" #include "rtc_base/system/rtc_export.h" @@ -76,6 +77,15 @@ class RTC_EXPORT BuiltinAudioProcessingBuilder return *this; } + // The AudioProcessingBuilder takes ownership of the + // neural_residual_echo_estimator. + BuiltinAudioProcessingBuilder& SetNeuralResidualEchoEstimator( + std::unique_ptr<NeuralResidualEchoEstimator> + neural_residual_echo_estimator) { + neural_residual_echo_estimator_ = std::move(neural_residual_echo_estimator); + return *this; + } + // Creates an APM instance with the specified config or the default one if // unspecified. Injects the specified components transferring the ownership // to the newly created APM instance. @@ -89,6 +99,7 @@ class RTC_EXPORT BuiltinAudioProcessingBuilder std::unique_ptr<CustomProcessing> render_pre_processing_; scoped_refptr<EchoDetector> echo_detector_; std::unique_ptr<CustomAudioAnalyzer> capture_analyzer_; + std::unique_ptr<NeuralResidualEchoEstimator> neural_residual_echo_estimator_; }; } // namespace webrtc diff --git a/third_party/libwebrtc/api/audio/echo_canceller3_config.cc b/third_party/libwebrtc/api/audio/echo_canceller3_config.cc @@ -272,6 +272,14 @@ bool EchoCanceller3Config::Validate(EchoCanceller3Config* config) { res = res & Limit(&c->suppressor.high_bands_suppression.anti_howling_gain, 0.f, 1.f); + res = + res & Limit(&c->suppressor.high_frequency_suppression.limiting_gain_band, + 1, 64); + res = + res & + Limit(&c->suppressor.high_frequency_suppression.bands_in_limiting_gain, 0, + 64 - c->suppressor.high_frequency_suppression.limiting_gain_band); + res = res & Limit(&c->suppressor.floor_first_increase, 0.f, 1000000.f); return res; diff --git 
a/third_party/libwebrtc/api/audio/echo_canceller3_config.h b/third_party/libwebrtc/api/audio/echo_canceller3_config.h @@ -237,6 +237,11 @@ struct RTC_EXPORT EchoCanceller3Config { float anti_howling_gain = 1.f; } high_bands_suppression; + struct HighFrequencySuppression { + int limiting_gain_band = 16; + int bands_in_limiting_gain = 1; + } high_frequency_suppression; + float floor_first_increase = 0.00001f; bool conservative_hf_suppression = false; } suppressor; diff --git a/third_party/libwebrtc/api/audio/echo_canceller3_factory.cc b/third_party/libwebrtc/api/audio/echo_canceller3_factory.cc @@ -35,9 +35,10 @@ absl_nonnull std::unique_ptr<EchoControl> EchoCanceller3Factory::Create( int sample_rate_hz, int num_render_channels, int num_capture_channels) { - return std::make_unique<EchoCanceller3>(env, config_, multichannel_config_, - sample_rate_hz, num_render_channels, - num_capture_channels); + return std::make_unique<EchoCanceller3>( + env, config_, multichannel_config_, + /*neural_residual_echo_estimator=*/nullptr, sample_rate_hz, + num_render_channels, num_capture_channels); } } // namespace webrtc diff --git a/third_party/libwebrtc/api/audio/neural_residual_echo_estimator.h b/third_party/libwebrtc/api/audio/neural_residual_echo_estimator.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2025 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef API_AUDIO_NEURAL_RESIDUAL_ECHO_ESTIMATOR_H_ +#define API_AUDIO_NEURAL_RESIDUAL_ECHO_ESTIMATOR_H_ + +#include <array> + +#include "api/array_view.h" + +namespace webrtc { + +// Interface for a neural residual echo estimator module injected into the echo +// canceller. 
+// This estimator estimates the echo residual that is not fully removed by the +// linear AEC3 estimator. +class NeuralResidualEchoEstimator { + public: + virtual ~NeuralResidualEchoEstimator() {} + // Estimates residual echo power spectrum in the signal after linear AEC + // subtraction. Returns two estimates: + // * R2: A conservative estimate. + // * R2_unbounded: A less conservative estimate. + // + // Input signals: + // * x: Render signal (time-domain) + // * y: Microphone signal (time-domain) + // * e: Output from linear subtraction stage (time-domain) + // + // Input power spectra: + // * S2: Linear echo estimate + // * Y2: Microphone input + // * E2: Output of linear stage + virtual void Estimate(ArrayView<const float> x, + ArrayView<const std::array<float, 64>> y, + ArrayView<const std::array<float, 64>> e, + ArrayView<const std::array<float, 65>> S2, + ArrayView<const std::array<float, 65>> Y2, + ArrayView<const std::array<float, 65>> E2, + ArrayView<std::array<float, 65>> R2, + ArrayView<std::array<float, 65>> R2_unbounded) = 0; +}; +} // namespace webrtc + +#endif // API_AUDIO_NEURAL_RESIDUAL_ECHO_ESTIMATOR_H_ diff --git a/third_party/libwebrtc/modules/audio_processing/BUILD.gn b/third_party/libwebrtc/modules/audio_processing/BUILD.gn @@ -185,6 +185,7 @@ rtc_library("audio_processing") { "../../api/audio:audio_processing", "../../api/audio:audio_processing_statistics", "../../api/audio:echo_control", + "../../api/audio:neural_residual_echo_estimator_api", "../../api/environment", "../../api/task_queue", "../../audio/utility:audio_frame_operations", diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/BUILD.gn b/third_party/libwebrtc/modules/audio_processing/aec3/BUILD.gn @@ -144,6 +144,7 @@ rtc_library("aec3") { "../../../api:field_trials_view", "../../../api/audio:aec3_config", "../../../api/audio:echo_control", + "../../../api/audio:neural_residual_echo_estimator_api", "../../../api/environment", "../../../common_audio:common_audio_c", 
"../../../rtc_base:checks", @@ -317,6 +318,7 @@ if (rtc_include_tests) { "../../../api:field_trials", "../../../api/audio:aec3_config", "../../../api/audio:echo_control", + "../../../api/audio:neural_residual_echo_estimator_api", "../../../api/environment", "../../../api/environment:environment_factory", "../../../rtc_base:checks", diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/aec3_common.h b/third_party/libwebrtc/modules/audio_processing/aec3/aec3_common.h @@ -46,6 +46,7 @@ constexpr size_t kSubFrameLength = kFrameSize / 2; constexpr size_t kBlockSize = kFftLengthBy2; constexpr size_t kBlockSizeLog2 = kFftLengthBy2Log2; +constexpr size_t kBlockSizeMs = kFftLengthBy2 * 1000 / 16000; constexpr size_t kExtendedBlockSize = 2 * kFftLengthBy2; constexpr size_t kMatchedFilterWindowSizeSubBlocks = 32; diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/aec_state.cc b/third_party/libwebrtc/modules/audio_processing/aec3/aec_state.cc @@ -383,7 +383,6 @@ void AecState::FilterDelay::Update( if (external_delay && (!external_delay_ || external_delay_->delay != external_delay->delay)) { external_delay_ = external_delay; - external_delay_reported_ = true; } // Override the estimated delay if it is not certain that the filter has had diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/aec_state.h b/third_party/libwebrtc/modules/audio_processing/aec3/aec_state.h @@ -157,6 +157,10 @@ class AecState { return filter_analyzer_.FilterLengthBlocks(); } + std::optional<DelayEstimate> ExternalDelayBlocks() const { + return delay_state_.ExternalDelayBlocks(); + } + private: static std::atomic<int> instance_count_; std::unique_ptr<ApmDataDumper> data_dumper_; @@ -201,7 +205,13 @@ class AecState { // Returns whether an external delay has been reported to the AecState (from // the delay estimator). 
- bool ExternalDelayReported() const { return external_delay_reported_; } + bool ExternalDelayReported() const { return external_delay_.has_value(); } + + // Returns the external delay reported to the AecState (from the delay + // estimator). + std::optional<DelayEstimate> ExternalDelayBlocks() const { + return external_delay_; + } // Returns the delay in blocks relative to the beginning of the filter that // corresponds to the direct path of the echo. @@ -220,7 +230,6 @@ class AecState { private: const int delay_headroom_blocks_; - bool external_delay_reported_ = false; std::vector<int> filter_delays_blocks_; int min_filter_delay_; std::optional<DelayEstimate> external_delay_; diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/block_processor.cc b/third_party/libwebrtc/modules/audio_processing/aec3/block_processor.cc @@ -241,7 +241,8 @@ std::unique_ptr<BlockProcessor> BlockProcessor::Create( const EchoCanceller3Config& config, int sample_rate_hz, size_t num_render_channels, - size_t num_capture_channels) { + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator) { std::unique_ptr<RenderDelayBuffer> render_buffer( RenderDelayBuffer::Create(config, sample_rate_hz, num_render_channels)); std::unique_ptr<RenderDelayController> delay_controller; @@ -249,8 +250,9 @@ std::unique_ptr<BlockProcessor> BlockProcessor::Create( delay_controller.reset(RenderDelayController::Create(config, sample_rate_hz, num_capture_channels)); } - std::unique_ptr<EchoRemover> echo_remover = EchoRemover::Create( - env, config, sample_rate_hz, num_render_channels, num_capture_channels); + std::unique_ptr<EchoRemover> echo_remover = + EchoRemover::Create(env, config, sample_rate_hz, num_render_channels, + num_capture_channels, neural_residual_echo_estimator); return Create(config, sample_rate_hz, num_render_channels, num_capture_channels, std::move(render_buffer), std::move(delay_controller), std::move(echo_remover)); @@ -262,14 +264,16 @@ 
std::unique_ptr<BlockProcessor> BlockProcessor::Create( int sample_rate_hz, size_t num_render_channels, size_t num_capture_channels, - std::unique_ptr<RenderDelayBuffer> render_buffer) { + std::unique_ptr<RenderDelayBuffer> render_buffer, + NeuralResidualEchoEstimator* neural_residual_echo_estimator) { std::unique_ptr<RenderDelayController> delay_controller; if (!config.delay.use_external_delay_estimator) { delay_controller.reset(RenderDelayController::Create(config, sample_rate_hz, num_capture_channels)); } - std::unique_ptr<EchoRemover> echo_remover = EchoRemover::Create( - env, config, sample_rate_hz, num_render_channels, num_capture_channels); + std::unique_ptr<EchoRemover> echo_remover = + EchoRemover::Create(env, config, sample_rate_hz, num_render_channels, + num_capture_channels, neural_residual_echo_estimator); return Create(config, sample_rate_hz, num_render_channels, num_capture_channels, std::move(render_buffer), std::move(delay_controller), std::move(echo_remover)); diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/block_processor.h b/third_party/libwebrtc/modules/audio_processing/aec3/block_processor.h @@ -17,6 +17,7 @@ #include "api/audio/echo_canceller3_config.h" #include "api/audio/echo_control.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "modules/audio_processing/aec3/block.h" #include "modules/audio_processing/aec3/echo_remover.h" @@ -33,7 +34,8 @@ class BlockProcessor { const EchoCanceller3Config& config, int sample_rate_hz, size_t num_render_channels, - size_t num_capture_channels); + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator); // Only used for testing purposes. 
static std::unique_ptr<BlockProcessor> Create( const Environment& env, @@ -41,7 +43,8 @@ class BlockProcessor { int sample_rate_hz, size_t num_render_channels, size_t num_capture_channels, - std::unique_ptr<RenderDelayBuffer> render_buffer); + std::unique_ptr<RenderDelayBuffer> render_buffer, + NeuralResidualEchoEstimator* neural_residual_echo_estimator); static std::unique_ptr<BlockProcessor> Create( const EchoCanceller3Config& config, int sample_rate_hz, diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/block_processor_unittest.cc b/third_party/libwebrtc/modules/audio_processing/aec3/block_processor_unittest.cc @@ -53,9 +53,9 @@ void RunBasicSetupAndApiCallTest(const Environment& env, constexpr size_t kNumRenderChannels = 1; constexpr size_t kNumCaptureChannels = 1; - std::unique_ptr<BlockProcessor> block_processor = - BlockProcessor::Create(env, EchoCanceller3Config(), sample_rate_hz, - kNumRenderChannels, kNumCaptureChannels); + std::unique_ptr<BlockProcessor> block_processor = BlockProcessor::Create( + env, EchoCanceller3Config(), sample_rate_hz, kNumRenderChannels, + kNumCaptureChannels, /*neural_residual_echo_estimator=*/nullptr); Block block(NumBandsForRate(sample_rate_hz), kNumRenderChannels, 1000.f); for (int k = 0; k < num_iterations; ++k) { block_processor->BufferRender(block); @@ -70,9 +70,9 @@ void RunRenderBlockSizeVerificationTest(const Environment& env, constexpr size_t kNumRenderChannels = 1; constexpr size_t kNumCaptureChannels = 1; - std::unique_ptr<BlockProcessor> block_processor = - BlockProcessor::Create(env, EchoCanceller3Config(), sample_rate_hz, - kNumRenderChannels, kNumCaptureChannels); + std::unique_ptr<BlockProcessor> block_processor = BlockProcessor::Create( + env, EchoCanceller3Config(), sample_rate_hz, kNumRenderChannels, + kNumCaptureChannels, /*neural_residual_echo_estimator=*/nullptr); Block block(NumBandsForRate(sample_rate_hz), kNumRenderChannels); EXPECT_DEATH(block_processor->BufferRender(block), ""); @@ -86,9 
+86,9 @@ void RunRenderNumBandsVerificationTest(const Environment& env, const size_t wrong_num_bands = NumBandsForRate(sample_rate_hz) < 3 ? NumBandsForRate(sample_rate_hz) + 1 : 1; - std::unique_ptr<BlockProcessor> block_processor = - BlockProcessor::Create(env, EchoCanceller3Config(), sample_rate_hz, - kNumRenderChannels, kNumCaptureChannels); + std::unique_ptr<BlockProcessor> block_processor = BlockProcessor::Create( + env, EchoCanceller3Config(), sample_rate_hz, kNumRenderChannels, + kNumCaptureChannels, /*neural_residual_echo_estimator=*/nullptr); Block block(wrong_num_bands, kNumRenderChannels); EXPECT_DEATH(block_processor->BufferRender(block), ""); @@ -102,9 +102,9 @@ void RunCaptureNumBandsVerificationTest(const Environment& env, const size_t wrong_num_bands = NumBandsForRate(sample_rate_hz) < 3 ? NumBandsForRate(sample_rate_hz) + 1 : 1; - std::unique_ptr<BlockProcessor> block_processor = - BlockProcessor::Create(env, EchoCanceller3Config(), sample_rate_hz, - kNumRenderChannels, kNumCaptureChannels); + std::unique_ptr<BlockProcessor> block_processor = BlockProcessor::Create( + env, EchoCanceller3Config(), sample_rate_hz, kNumRenderChannels, + kNumCaptureChannels, /*neural_residual_echo_estimator=*/nullptr); Block block(wrong_num_bands, kNumRenderChannels); EXPECT_DEATH(block_processor->ProcessCapture(false, false, nullptr, &block), @@ -155,7 +155,8 @@ TEST(BlockProcessor, DISABLED_DelayControllerIntegration) { .WillRepeatedly(Return(0)); std::unique_ptr<BlockProcessor> block_processor = BlockProcessor::Create( env, EchoCanceller3Config(), rate, kNumRenderChannels, - kNumCaptureChannels, std::move(render_delay_buffer_mock)); + kNumCaptureChannels, std::move(render_delay_buffer_mock), + /*neural_residual_echo_estimator=*/nullptr); Block render_block(NumBandsForRate(rate), kNumRenderChannels); Block capture_block(NumBandsForRate(rate), kNumCaptureChannels); @@ -269,19 +270,21 @@ TEST(BlockProcessorDeathTest, VerifyCaptureNumBandsCheck) { // Verifiers that the 
verification for null ProcessCapture input works. TEST(BlockProcessorDeathTest, NullProcessCaptureParameter) { - EXPECT_DEATH(BlockProcessor::Create(CreateEnvironment(), - EchoCanceller3Config(), 16000, 1, 1) - ->ProcessCapture(false, false, nullptr, nullptr), - ""); + EXPECT_DEATH( + BlockProcessor::Create(CreateEnvironment(), EchoCanceller3Config(), 16000, + 1, 1, /*neural_residual_echo_estimator=*/nullptr) + ->ProcessCapture(false, false, nullptr, nullptr), + ""); } // Verifies the check for correct sample rate. // TODO(peah): Re-enable the test once the issue with memory leaks during DEATH // tests on test bots has been fixed. TEST(BlockProcessor, DISABLED_WrongSampleRate) { - EXPECT_DEATH(BlockProcessor::Create(CreateEnvironment(), - EchoCanceller3Config(), 8001, 1, 1), - ""); + EXPECT_DEATH( + BlockProcessor::Create(CreateEnvironment(), EchoCanceller3Config(), 8001, + 1, 1, /*neural_residual_echo_estimator=*/nullptr), + ""); } #endif @@ -355,4 +358,4 @@ TEST(BlockProcessor, ExternalDelayAppliedCorrectlyWithInitialCaptureCalls) { block_processor->ProcessCapture(false, false, nullptr, &capture_block); } -} // namespace webrtc +} // namespace webrtc +\ No newline at end of file diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.cc b/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.cc @@ -748,6 +748,7 @@ EchoCanceller3::EchoCanceller3( const Environment& env, const EchoCanceller3Config& config, const std::optional<EchoCanceller3Config>& multichannel_config, + NeuralResidualEchoEstimator* neural_residual_echo_estimator, int sample_rate_hz, size_t num_render_channels, size_t num_capture_channels) @@ -770,6 +771,7 @@ EchoCanceller3::EchoCanceller3( .multi_channel.stereo_detection_timeout_threshold_seconds, config_selector_.active_config() .multi_channel.stereo_detection_hysteresis_seconds), + neural_residual_echo_estimator_(neural_residual_echo_estimator), output_framer_(num_bands_, num_capture_channels_), 
capture_blocker_(num_bands_, num_capture_channels_), render_transfer_queue_( @@ -843,7 +845,8 @@ void EchoCanceller3::Initialize() { block_processor_ = BlockProcessor::Create( env_, config_selector_.active_config(), sample_rate_hz_, - num_render_channels_to_aec_, num_capture_channels_); + num_render_channels_to_aec_, num_capture_channels_, + neural_residual_echo_estimator_); render_sub_frame_view_ = std::vector<std::vector<ArrayView<float>>>( num_bands_, std::vector<ArrayView<float>>(num_render_channels_to_aec_)); diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.h b/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3.h @@ -21,6 +21,7 @@ #include "api/array_view.h" #include "api/audio/echo_canceller3_config.h" #include "api/audio/echo_control.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "api/field_trials_view.h" #include "modules/audio_processing/aec3/api_call_jitter_metrics.h" @@ -95,6 +96,7 @@ class EchoCanceller3 : public EchoControl { EchoCanceller3(const Environment& env, const EchoCanceller3Config& config, const std::optional<EchoCanceller3Config>& multichannel_config, + NeuralResidualEchoEstimator* neural_residual_echo_estimator, int sample_rate_hz, size_t num_render_channels, size_t num_capture_channels); @@ -198,6 +200,7 @@ class EchoCanceller3 : public EchoControl { const size_t num_capture_channels_; ConfigSelector config_selector_; MultiChannelContentDetector multichannel_content_detector_; + NeuralResidualEchoEstimator* neural_residual_echo_estimator_; std::unique_ptr<BlockFramer> linear_output_framer_ RTC_GUARDED_BY(capture_race_checker_); BlockFramer output_framer_ RTC_GUARDED_BY(capture_race_checker_); diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3_unittest.cc b/third_party/libwebrtc/modules/audio_processing/aec3/echo_canceller3_unittest.cc @@ -251,8 +251,9 @@ class EchoCanceller3Tester { // output. 
void RunCaptureTransportVerificationTest() { EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); aec3.SetBlockProcessorForTesting( std::make_unique<CaptureTransportVerificationProcessor>(num_bands_)); @@ -277,8 +278,9 @@ class EchoCanceller3Tester { // block processor. void RunRenderTransportVerificationTest() { EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); aec3.SetBlockProcessorForTesting( std::make_unique<RenderTransportVerificationProcessor>(num_bands_)); @@ -347,8 +349,9 @@ class EchoCanceller3Tester { } EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); aec3.SetBlockProcessorForTesting(std::move(block_processor_mock)); for (size_t frame_index = 0; frame_index < kNumFramesToProcess; @@ -428,8 +431,9 @@ class EchoCanceller3Tester { } EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); aec3.SetBlockProcessorForTesting(std::move(block_processor_mock)); for (size_t frame_index = 0; frame_index < kNumFramesToProcess; @@ -515,8 +519,9 @@ class EchoCanceller3Tester { } EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); 
aec3.SetBlockProcessorForTesting(std::move(block_processor_mock)); for (size_t frame_index = 0; frame_index < kNumFramesToProcess; ++frame_index) { @@ -556,8 +561,9 @@ class EchoCanceller3Tester { void RunRenderSwapQueueVerificationTest() { const EchoCanceller3Config config; EchoCanceller3 aec3(CreateEnvironment(), config, - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); aec3.SetBlockProcessorForTesting( std::make_unique<RenderTransportVerificationProcessor>(num_bands_)); @@ -606,8 +612,9 @@ class EchoCanceller3Tester { // properly reported. void RunRenderPipelineSwapQueueOverrunReturnValueTest() { EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz_, - 1, 1); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz_, 1, 1); constexpr size_t kRenderTransferQueueSize = 30; for (size_t k = 0; k < 2; ++k) { @@ -634,6 +641,7 @@ class EchoCanceller3Tester { const int aec3_sample_rate_hz = sample_rate_hz_ == 48000 ? 32000 : 48000; EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, aec3_sample_rate_hz, 1, 1); PopulateInputFrame(frame_length_, 0, &render_buffer_.channels_f()[0][0], 0); @@ -649,6 +657,7 @@ class EchoCanceller3Tester { const int aec3_sample_rate_hz = sample_rate_hz_ == 48000 ? 
32000 : 48000; EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, aec3_sample_rate_hz, 1, 1); PopulateInputFrame(frame_length_, num_bands_, 0, &capture_buffer_.split_bands_f(0)[0], 100); @@ -972,6 +981,7 @@ TEST(EchoCanceller3, DetectionOfProperStereo) { kNumBlocksForSurroundConfig; EchoCanceller3 aec3(CreateEnvironment(), mono_config, multichannel_config, + /*neural_residual_echo_estimator=*/nullptr, /*sample_rate_hz=*/kSampleRateHz, /*num_render_channels=*/kNumChannels, /*num_capture_channels=*/kNumChannels); @@ -1020,6 +1030,7 @@ TEST(EchoCanceller3, DetectionOfProperStereoUsingThreshold) { kNumBlocksForSurroundConfig; EchoCanceller3 aec3(CreateEnvironment(), mono_config, multichannel_config, + /*neural_residual_echo_estimator=*/nullptr, /*sample_rate_hz=*/kSampleRateHz, /*num_render_channels=*/kNumChannels, /*num_capture_channels=*/kNumChannels); @@ -1067,6 +1078,7 @@ TEST(EchoCanceller3, DetectionOfProperStereoUsingHysteresis) { kNumBlocksForSurroundConfig; EchoCanceller3 aec3(CreateEnvironment(), mono_config, surround_config, + /*neural_residual_echo_estimator=*/nullptr, /*sample_rate_hz=*/kSampleRateHz, /*num_render_channels=*/kNumChannels, /*num_capture_channels=*/kNumChannels); @@ -1133,6 +1145,7 @@ TEST(EchoCanceller3, StereoContentDetectionForMonoSignals) { /*output_num_channels=*/1); EchoCanceller3 aec3(env, mono_config, multichannel_config, + /*neural_residual_echo_estimator=*/nullptr, /*sample_rate_hz=*/kSampleRateHz, /*num_render_channels=*/1, /*num_capture_channels=*/1); @@ -1150,6 +1163,59 @@ TEST(EchoCanceller3, StereoContentDetectionForMonoSignals) { } } +TEST(EchoCanceller3, InjectedNeuralResidualEchoEstimatorIsUsed) { + class NeuralResidualEchoEstimatorImpl : public NeuralResidualEchoEstimator { + public: + NeuralResidualEchoEstimatorImpl() {} + + void Estimate(ArrayView<const float> render, + ArrayView<const std::array<float, 64>> capture, + 
ArrayView<const std::array<float, 64>> linear_aec_output, + ArrayView<const std::array<float, 65>> S2_linear, + ArrayView<const std::array<float, 65>> Y2, + ArrayView<const std::array<float, 65>> E2, + ArrayView<std::array<float, 65>> R2, + ArrayView<std::array<float, 65>> R2_unbounded) override { + residual_echo_estimate_requested_ = true; + for (auto& R2_ch : R2) { + R2_ch.fill(0.0f); + } + for (auto& R2_ch : R2_unbounded) { + R2_ch.fill(0.0f); + } + } + bool residual_echo_estimate_requested() const { + return residual_echo_estimate_requested_; + } + + private: + bool residual_echo_estimate_requested_ = false; + }; + + constexpr int kSampleRateHz = 16000; + constexpr int kNumChannels = 1; + NeuralResidualEchoEstimatorImpl neural_residual_echo_estimator; + const Environment env = CreateEnvironment(); + EchoCanceller3Config config; + AudioBuffer buffer(/*input_rate=*/kSampleRateHz, + /*input_num_channels=*/kNumChannels, + /*buffer_rate=*/kSampleRateHz, + /*buffer_num_channels=*/kNumChannels, + /*output_rate=*/kSampleRateHz, + /*output_num_channels=*/kNumChannels); + EchoCanceller3 aec3(env, config, /*multichannel_config=*/std::nullopt, + &neural_residual_echo_estimator, + /*sample_rate_hz=*/kSampleRateHz, + /*num_render_channels=*/kNumChannels, + /*num_capture_input_channels=*/kNumChannels); + constexpr int kNumFramesToProcess = 300; + for (int k = 0; k < kNumFramesToProcess; ++k) { + RunAecInSMono(buffer, aec3, k); + } + EXPECT_TRUE( + neural_residual_echo_estimator.residual_echo_estimate_requested()); +} + #if RTC_DCHECK_IS_ON && GTEST_HAS_DEATH_TEST && !defined(WEBRTC_ANDROID) TEST(EchoCanceller3InputCheckDeathTest, WrongCaptureNumBandsCheckVerification) { @@ -1164,7 +1230,8 @@ TEST(EchoCanceller3InputCheckDeathTest, WrongCaptureNumBandsCheckVerification) { TEST(EchoCanceller3InputCheckDeathTest, NullCaptureProcessingParameter) { EXPECT_DEATH( EchoCanceller3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config_=*/std::nullopt, 16000, 1, 1) + 
/*multichannel_config_=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, 16000, 1, 1) .ProcessCapture(nullptr, false), ""); } @@ -1176,7 +1243,8 @@ TEST(EchoCanceller3InputCheckDeathTest, DISABLED_WrongSampleRate) { ApmDataDumper data_dumper(0); EXPECT_DEATH( EchoCanceller3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config_=*/std::nullopt, 8001, 1, 1), + /*multichannel_config_=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, 8001, 1, 1), ""); } diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover.cc b/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover.cc @@ -114,7 +114,8 @@ class EchoRemoverImpl final : public EchoRemover { const EchoCanceller3Config& config, int sample_rate_hz, size_t num_render_channels, - size_t num_capture_channels); + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator); ~EchoRemoverImpl() override; EchoRemoverImpl(const EchoRemoverImpl&) = delete; EchoRemoverImpl& operator=(const EchoRemoverImpl&) = delete; @@ -188,11 +189,13 @@ class EchoRemoverImpl final : public EchoRemover { std::atomic<int> EchoRemoverImpl::instance_count_(0); -EchoRemoverImpl::EchoRemoverImpl(const Environment& env, - const EchoCanceller3Config& config, - int sample_rate_hz, - size_t num_render_channels, - size_t num_capture_channels) +EchoRemoverImpl::EchoRemoverImpl( + const Environment& env, + const EchoCanceller3Config& config, + int sample_rate_hz, + size_t num_render_channels, + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator) : config_(config), fft_(), data_dumper_(new ApmDataDumper(instance_count_.fetch_add(1) + 1)), @@ -217,7 +220,10 @@ EchoRemoverImpl::EchoRemoverImpl(const Environment& env, sample_rate_hz_, num_capture_channels_), render_signal_analyzer_(config_), - residual_echo_estimator_(env, config_, num_render_channels), + residual_echo_estimator_(env, + config_, + num_render_channels, + 
neural_residual_echo_estimator), aec_state_(env, config_, num_capture_channels_), e_old_(num_capture_channels_, {0.f}), y_old_(num_capture_channels_, {0.f}), @@ -386,6 +392,11 @@ void EchoRemoverImpl::ProcessCapture( Y[ch].Spectrum(optimization_, Y2[ch]); E[ch].Spectrum(optimization_, E2[ch]); } + // `y_old_` and `e_old_` now point to the current block. Though their channel + // layout is already suitable for residual echo estimation, an alias is + // created for clarity. + const auto& y_current = y_old_; + const auto& e_current = e_old_; // Optionally return the linear filter output. if (linear_output) { @@ -418,9 +429,9 @@ void EchoRemoverImpl::ProcessCapture( std::array<float, kFftLengthBy2Plus1> G; if (capture_output_used_) { // Estimate the residual echo power. - residual_echo_estimator_.Estimate(aec_state_, *render_buffer, S2_linear, Y2, - suppression_gain_.IsDominantNearend(), R2, - R2_unbounded); + residual_echo_estimator_.Estimate( + aec_state_, *render_buffer, y_current, e_current, S2_linear, Y2, E2, + suppression_gain_.IsDominantNearend(), R2, R2_unbounded); // Suppressor nearend estimate. 
if (aec_state_.UsableLinearEstimate()) { @@ -523,9 +534,11 @@ std::unique_ptr<EchoRemover> EchoRemover::Create( const EchoCanceller3Config& config, int sample_rate_hz, size_t num_render_channels, - size_t num_capture_channels) { + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator) { return std::make_unique<EchoRemoverImpl>( - env, config, sample_rate_hz, num_render_channels, num_capture_channels); + env, config, sample_rate_hz, num_render_channels, num_capture_channels, + neural_residual_echo_estimator); } } // namespace webrtc diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover.h b/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover.h @@ -17,6 +17,7 @@ #include "api/audio/echo_canceller3_config.h" #include "api/audio/echo_control.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "modules/audio_processing/aec3/block.h" #include "modules/audio_processing/aec3/delay_estimate.h" @@ -28,11 +29,13 @@ namespace webrtc { // Class for removing the echo from the capture signal. class EchoRemover { public: - static std::unique_ptr<EchoRemover> Create(const Environment& env, - const EchoCanceller3Config& config, - int sample_rate_hz, - size_t num_render_channels, - size_t num_capture_channels); + static std::unique_ptr<EchoRemover> Create( + const Environment& env, + const EchoCanceller3Config& config, + int sample_rate_hz, + size_t num_render_channels, + size_t num_capture_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator); virtual ~EchoRemover() = default; // Get current metrics. 
diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover_unittest.cc b/third_party/libwebrtc/modules/audio_processing/aec3/echo_remover_unittest.cc @@ -66,9 +66,9 @@ TEST_P(EchoRemoverMultiChannel, BasicApiCalls) { std::optional<DelayEstimate> delay_estimate; for (auto rate : {16000, 32000, 48000}) { SCOPED_TRACE(ProduceDebugText(rate)); - std::unique_ptr<EchoRemover> remover = - EchoRemover::Create(env, EchoCanceller3Config(), rate, - num_render_channels, num_capture_channels); + std::unique_ptr<EchoRemover> remover = EchoRemover::Create( + env, EchoCanceller3Config(), rate, num_render_channels, + num_capture_channels, /*neural_residual_echo_estimator=*/nullptr); std::unique_ptr<RenderDelayBuffer> render_buffer(RenderDelayBuffer::Create( EchoCanceller3Config(), rate, num_render_channels)); @@ -96,9 +96,10 @@ TEST_P(EchoRemoverMultiChannel, BasicApiCalls) { // TODO(peah): Re-enable the test once the issue with memory leaks during DEATH // tests on test bots has been fixed. TEST(EchoRemoverDeathTest, DISABLED_WrongSampleRate) { - EXPECT_DEATH(EchoRemover::Create(CreateEnvironment(), EchoCanceller3Config(), - 8001, 1, 1), - ""); + EXPECT_DEATH( + EchoRemover::Create(CreateEnvironment(), EchoCanceller3Config(), 8001, 1, + 1, /*neural_residual_echo_estimator=*/nullptr), + ""); } // Verifies the check for the number of capture bands. @@ -110,7 +111,8 @@ TEST(EchoRemoverDeathTest, DISABLED_WrongCaptureNumBands) { for (auto rate : {16000, 32000, 48000}) { SCOPED_TRACE(ProduceDebugText(rate)); std::unique_ptr<EchoRemover> remover = - EchoRemover::Create(env, EchoCanceller3Config(), rate, 1, 1); + EchoRemover::Create(env, EchoCanceller3Config(), rate, 1, 1, + /*neural_residual_echo_estimator=*/nullptr); std::unique_ptr<RenderDelayBuffer> render_buffer( RenderDelayBuffer::Create(EchoCanceller3Config(), rate, 1)); Block capture(NumBandsForRate(rate == 48000 ? 
16000 : rate + 16000), 1); @@ -126,8 +128,9 @@ TEST(EchoRemoverDeathTest, DISABLED_WrongCaptureNumBands) { // Verifies the check for non-null capture block. TEST(EchoRemoverDeathTest, NullCapture) { std::optional<DelayEstimate> delay_estimate; - std::unique_ptr<EchoRemover> remover = EchoRemover::Create( - CreateEnvironment(), EchoCanceller3Config(), 16000, 1, 1); + std::unique_ptr<EchoRemover> remover = + EchoRemover::Create(CreateEnvironment(), EchoCanceller3Config(), 16000, 1, + 1, /*neural_residual_echo_estimator=*/nullptr); std::unique_ptr<RenderDelayBuffer> render_buffer( RenderDelayBuffer::Create(EchoCanceller3Config(), 16000, 1)); EchoPathVariability echo_path_variability( @@ -157,7 +160,8 @@ TEST(EchoRemover, BasicEchoRemoval) { SCOPED_TRACE(ProduceDebugText(rate, delay_samples)); EchoCanceller3Config config; std::unique_ptr<EchoRemover> remover = - EchoRemover::Create(env, config, rate, num_channels, num_channels); + EchoRemover::Create(env, config, rate, num_channels, num_channels, + /*neural_residual_echo_estimator=*/nullptr); std::unique_ptr<RenderDelayBuffer> render_buffer( RenderDelayBuffer::Create(config, rate, num_channels)); render_buffer->AlignFromDelay(delay_samples / kBlockSize); diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.cc b/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.cc @@ -162,9 +162,11 @@ void EchoGeneratingPower(size_t num_render_channels, } // namespace -ResidualEchoEstimator::ResidualEchoEstimator(const Environment& env, - const EchoCanceller3Config& config, - size_t num_render_channels) +ResidualEchoEstimator::ResidualEchoEstimator( + const Environment& env, + const EchoCanceller3Config& config, + size_t num_render_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator) : config_(config), num_render_channels_(num_render_channels), early_reflections_transparent_mode_gain_(GetTransparentModeGain()), @@ -177,7 +179,8 @@ 
ResidualEchoEstimator::ResidualEchoEstimator(const Environment& env, config_.ep_strength)), erle_onset_compensation_in_dominant_nearend_( UseErleOnsetCompensationInDominantNearend(env.field_trials(), - config_.ep_strength)) { + config_.ep_strength)), + neural_residual_echo_estimator_(neural_residual_echo_estimator) { Reset(); } @@ -186,8 +189,11 @@ ResidualEchoEstimator::~ResidualEchoEstimator() = default; void ResidualEchoEstimator::Estimate( const AecState& aec_state, const RenderBuffer& render_buffer, + ArrayView<const std::array<float, kFftLengthBy2>> capture, + ArrayView<const std::array<float, kFftLengthBy2>> linear_aec_output, ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear, ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2, + ArrayView<const std::array<float, kFftLengthBy2Plus1>> E2, bool dominant_nearend, ArrayView<std::array<float, kFftLengthBy2Plus1>> R2, ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) { @@ -199,26 +205,54 @@ void ResidualEchoEstimator::Estimate( // Estimate the power of the stationary noise in the render signal. UpdateRenderNoisePower(render_buffer); + // The neural residual echo estimation always runs, even if the estimated + // spectra |R2| and |R2_unbounded| are overwritten later. This ensures the + // estimator sees continuous signals at a constant time rate. 
+ if (neural_residual_echo_estimator_ != nullptr) { + constexpr int kNeuralDelayHeadroomMs = 12; + constexpr int kNeuralDelayHeadroomBlocks = + kNeuralDelayHeadroomMs / kBlockSizeMs; + constexpr int kJitterMarginBlocks = 3; + std::optional<DelayEstimate> external_delay_blocks = + aec_state.ExternalDelayBlocks(); + int headroom_blocks = 0; + int headroom_render_buffer = render_buffer.Headroom(); + if (external_delay_blocks && + external_delay_blocks->delay > + kNeuralDelayHeadroomBlocks + kJitterMarginBlocks && + headroom_render_buffer > 0) { + headroom_blocks = + std::min(headroom_render_buffer - 1, kNeuralDelayHeadroomBlocks); + } + ArrayView<const float> render = + render_buffer.GetBlock(headroom_blocks).View(/*band=*/0, /*ch=*/0); + neural_residual_echo_estimator_->Estimate(render, capture, + linear_aec_output, S2_linear, Y2, + E2, R2, R2_unbounded); + } + // Estimate the residual echo power. if (aec_state.UsableLinearEstimate()) { - // When there is saturated echo, assume the same spectral content as is - // present in the microphone signal. - if (aec_state.SaturatedEcho()) { - for (size_t ch = 0; ch < num_capture_channels; ++ch) { - std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin()); - std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin()); + if (neural_residual_echo_estimator_ == nullptr) { + // When there is saturated echo, assume the same spectral content as is + // present in the microphone signal. 
+ if (aec_state.SaturatedEcho()) { + for (size_t ch = 0; ch < num_capture_channels; ++ch) { + std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin()); + std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin()); + } + } else { + const bool onset_compensated = + erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend; + LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2); + LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded); } - } else { - const bool onset_compensated = - erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend; - LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2); - LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded); - } - UpdateReverb(ReverbType::kLinear, aec_state, render_buffer, - dominant_nearend); - AddReverb(R2); - AddReverb(R2_unbounded); + UpdateReverb(ReverbType::kLinear, aec_state, render_buffer, + dominant_nearend); + AddReverb(R2); + AddReverb(R2_unbounded); + } } else { const float echo_path_gain = GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true); diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.h b/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator.h @@ -16,6 +16,7 @@ #include "api/array_view.h" #include "api/audio/echo_canceller3_config.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "modules/audio_processing/aec3/aec3_common.h" #include "modules/audio_processing/aec3/aec_state.h" @@ -26,9 +27,11 @@ namespace webrtc { class ResidualEchoEstimator { public: - ResidualEchoEstimator(const Environment& env, - const EchoCanceller3Config& config, - size_t num_render_channels); + ResidualEchoEstimator( + const Environment& env, + const EchoCanceller3Config& config, + size_t num_render_channels, + NeuralResidualEchoEstimator* neural_residual_echo_estimator); ~ResidualEchoEstimator(); ResidualEchoEstimator(const 
ResidualEchoEstimator&) = delete; @@ -37,8 +40,11 @@ class ResidualEchoEstimator { void Estimate( const AecState& aec_state, const RenderBuffer& render_buffer, + ArrayView<const std::array<float, kFftLengthBy2>> capture, + ArrayView<const std::array<float, kFftLengthBy2>> linear_aec_output, ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear, ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2, + ArrayView<const std::array<float, kFftLengthBy2Plus1>> E2, bool dominant_nearend, ArrayView<std::array<float, kFftLengthBy2Plus1>> R2, ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded); @@ -77,6 +83,7 @@ class ResidualEchoEstimator { std::array<float, kFftLengthBy2Plus1> X2_noise_floor_; std::array<int, kFftLengthBy2Plus1> X2_noise_floor_counter_; ReverbModel echo_reverb_; + NeuralResidualEchoEstimator* neural_residual_echo_estimator_; }; } // namespace webrtc diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator_unittest.cc b/third_party/libwebrtc/modules/audio_processing/aec3/residual_echo_estimator_unittest.cc @@ -10,6 +10,7 @@ #include "modules/audio_processing/aec3/residual_echo_estimator.h" +#include <algorithm> #include <array> #include <cstddef> #include <memory> @@ -48,7 +49,10 @@ class ResidualEchoEstimatorTest { : num_render_channels_(num_render_channels), num_capture_channels_(num_capture_channels), config_(config), - estimator_(env_, config_, num_render_channels_), + estimator_(env_, + config_, + num_render_channels_, + /*neural_residual_echo_estimator=*/nullptr), aec_state_(env_, config_, num_capture_channels_), render_delay_buffer_(RenderDelayBuffer::Create(config_, kSampleRateHz, @@ -65,8 +69,10 @@ class ResidualEchoEstimatorTest { std::vector<float>( GetTimeDomainLength(config_.filter.refined.length_blocks), 0.0f)), + e_(num_capture_channels_), random_generator_(42U), - output_(num_capture_channels_) { + output_(num_capture_channels_), + y_(num_capture_channels_) { for (auto& H2_ch : H2_) { 
for (auto& H2_k : H2_ch) { H2_k.fill(0.01f); @@ -79,7 +85,9 @@ class ResidualEchoEstimatorTest { subtractor_output.Reset(); subtractor_output.s_refined.fill(100.f); } - y_.fill(0.f); + for (auto& y : y_) { + y.fill(0.f); + } constexpr float kLevel = 10.f; for (auto& E2_refined_ch : E2_refined_) { @@ -101,12 +109,17 @@ class ResidualEchoEstimatorTest { } render_delay_buffer_->PrepareCaptureProcessing(); + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + std::copy(output_[ch].e_refined.begin(), output_[ch].e_refined.end(), + e_[ch].begin()); + } aec_state_.Update(delay_estimate_, H2_, h_, *render_delay_buffer_->GetRenderBuffer(), E2_refined_, Y2_, output_); estimator_.Estimate(aec_state_, *render_delay_buffer_->GetRenderBuffer(), - S2_linear_, Y2_, dominant_nearend, R2_, R2_unbounded_); + y_, e_, S2_linear_, Y2_, E2_refined_, dominant_nearend, + R2_, R2_unbounded_); } ArrayView<const std::array<float, kFftLengthBy2Plus1>> R2() const { @@ -129,9 +142,10 @@ class ResidualEchoEstimatorTest { Block x_; std::vector<std::vector<std::array<float, kFftLengthBy2Plus1>>> H2_; std::vector<std::vector<float>> h_; + std::vector<std::array<float, kBlockSize>> e_; Random random_generator_; std::vector<SubtractorOutput> output_; - std::array<float, kBlockSize> y_; + std::vector<std::array<float, kBlockSize>> y_; std::optional<DelayEstimate> delay_estimate_; bool first_frame_ = true; }; diff --git a/third_party/libwebrtc/modules/audio_processing/aec3/suppression_gain.cc b/third_party/libwebrtc/modules/audio_processing/aec3/suppression_gain.cc @@ -41,18 +41,29 @@ void LimitLowFrequencyGains(std::array<float, kFftLengthBy2Plus1>* gain) { (*gain)[0] = (*gain)[1] = std::min((*gain)[1], (*gain)[2]); } -void LimitHighFrequencyGains(bool conservative_hf_suppression, +void LimitHighFrequencyGains(const EchoCanceller3Config::Suppressor& config, std::array<float, kFftLengthBy2Plus1>* gain) { // Limit the high frequency gains to avoid echo leakage due to an imperfect // filter. 
- constexpr size_t kFirstBandToLimit = (64 * 2000) / 8000; - const float min_upper_gain = (*gain)[kFirstBandToLimit]; - std::for_each( - gain->begin() + kFirstBandToLimit + 1, gain->end(), - [min_upper_gain](float& a) { a = std::min(a, min_upper_gain); }); + const int limiting_gain_band = + config.high_frequency_suppression.limiting_gain_band; + const int bands_in_limiting_gain = + config.high_frequency_suppression.bands_in_limiting_gain; + if (bands_in_limiting_gain > 0) { + RTC_DCHECK_GE(limiting_gain_band, 0); + RTC_DCHECK_LE(limiting_gain_band + bands_in_limiting_gain, gain->size()); + float min_upper_gain = 1.f; + for (int band = limiting_gain_band; + band < limiting_gain_band + bands_in_limiting_gain; ++band) { + min_upper_gain = std::min(min_upper_gain, (*gain)[band]); + } + std::for_each( + gain->begin() + limiting_gain_band + 1, gain->end(), + [min_upper_gain](float& a) { a = std::min(a, min_upper_gain); }); + } (*gain)[kFftLengthBy2] = (*gain)[kFftLengthBy2Minus1]; - if (conservative_hf_suppression) { + if (config.conservative_hf_suppression) { // Limits the gain in the frequencies for which the adaptive filter has not // converged. // TODO(peah): Make adaptive to take the actual filter error into account. @@ -315,8 +326,7 @@ void SuppressionGain::LowerBandGain( // dominant nearend. if (!dominant_nearend_detector_->IsNearendState() || clock_drift || config_.suppressor.conservative_hf_suppression) { - LimitHighFrequencyGains(config_.suppressor.conservative_hf_suppression, - gain); + LimitHighFrequencyGains(config_.suppressor, gain); } // Store computed gains. 
diff --git a/third_party/libwebrtc/modules/audio_processing/audio_processing_impl.cc b/third_party/libwebrtc/modules/audio_processing/audio_processing_impl.cc @@ -444,7 +444,8 @@ AudioProcessingImpl::AudioProcessingImpl(const Environment& env) /*render_pre_processor=*/nullptr, /*echo_control_factory=*/nullptr, /*echo_detector=*/nullptr, - /*capture_analyzer=*/nullptr) {} + /*capture_analyzer=*/nullptr, + /*neural_residual_echo_estimator=*/nullptr) {} std::atomic<int> AudioProcessingImpl::instance_count_(0); @@ -455,7 +456,8 @@ AudioProcessingImpl::AudioProcessingImpl( std::unique_ptr<CustomProcessing> render_pre_processor, std::unique_ptr<EchoControlFactory> echo_control_factory, scoped_refptr<EchoDetector> echo_detector, - std::unique_ptr<CustomAudioAnalyzer> capture_analyzer) + std::unique_ptr<CustomAudioAnalyzer> capture_analyzer, + std::unique_ptr<NeuralResidualEchoEstimator> neural_residual_echo_estimator) : env_(env), data_dumper_(new ApmDataDumper(instance_count_.fetch_add(1) + 1)), use_setup_specific_default_aec3_config_( @@ -472,7 +474,8 @@ AudioProcessingImpl::AudioProcessingImpl( submodules_(std::move(capture_post_processor), std::move(render_pre_processor), std::move(echo_detector), - std::move(capture_analyzer)), + std::move(capture_analyzer), + std::move(neural_residual_echo_estimator)), constants_(!env.field_trials().IsEnabled( "WebRTC-ApmExperimentalMultiChannelRenderKillSwitch"), !env.field_trials().IsEnabled( @@ -493,7 +496,9 @@ AudioProcessingImpl::AudioProcessingImpl( << "\nCapture post processor: " << !!submodules_.capture_post_processor << "\nRender pre processor: " - << !!submodules_.render_pre_processor; + << !!submodules_.render_pre_processor + << "\nNeural residual echo estimator " + << !!submodules_.neural_residual_echo_estimator; if (!DenormalDisabler::IsSupported()) { RTC_LOG(LS_INFO) << "Denormal disabler unsupported"; } @@ -1926,8 +1931,9 @@ void AudioProcessingImpl::InitializeEchoController() { 
EchoCanceller3Config::CreateDefaultMultichannelConfig(); } submodules_.echo_controller = std::make_unique<EchoCanceller3>( - env_, config, multichannel_config, proc_sample_rate_hz(), - num_reverse_channels(), num_proc_channels()); + env_, config, multichannel_config, + submodules_.neural_residual_echo_estimator.get(), + proc_sample_rate_hz(), num_reverse_channels(), num_proc_channels()); } // Setup the storage for returning the linear AEC output. @@ -2184,6 +2190,9 @@ void AudioProcessingImpl::WriteAecDumpConfigMessage(bool forced) { if (config_.gain_controller2.enabled) { experiments_description += "GainController2;"; } + if (submodules_.neural_residual_echo_estimator) { + experiments_description += "NeuralResidualEchoEstimator;"; + } InternalAPMConfig apm_config; diff --git a/third_party/libwebrtc/modules/audio_processing/audio_processing_impl.h b/third_party/libwebrtc/modules/audio_processing/audio_processing_impl.h @@ -26,6 +26,7 @@ #include "api/audio/audio_processing.h" #include "api/audio/audio_processing_statistics.h" #include "api/audio/echo_control.h" +#include "api/audio/neural_residual_echo_estimator.h" #include "api/environment/environment.h" #include "api/scoped_refptr.h" #include "api/task_queue/task_queue_base.h" @@ -68,7 +69,9 @@ class AudioProcessingImpl : public AudioProcessing { std::unique_ptr<CustomProcessing> render_pre_processor, std::unique_ptr<EchoControlFactory> echo_control_factory, scoped_refptr<EchoDetector> echo_detector, - std::unique_ptr<CustomAudioAnalyzer> capture_analyzer); + std::unique_ptr<CustomAudioAnalyzer> capture_analyzer, + std::unique_ptr<NeuralResidualEchoEstimator> + neural_residual_echo_estimator); ~AudioProcessingImpl() override; int Initialize() override; int Initialize(const ProcessingConfig& processing_config) override; @@ -367,11 +370,15 @@ class AudioProcessingImpl : public AudioProcessing { Submodules(std::unique_ptr<CustomProcessing> capture_post_processor, std::unique_ptr<CustomProcessing> 
render_pre_processor, scoped_refptr<EchoDetector> echo_detector, - std::unique_ptr<CustomAudioAnalyzer> capture_analyzer) + std::unique_ptr<CustomAudioAnalyzer> capture_analyzer, + std::unique_ptr<NeuralResidualEchoEstimator> + neural_residual_echo_estimator) : echo_detector(std::move(echo_detector)), capture_post_processor(std::move(capture_post_processor)), render_pre_processor(std::move(render_pre_processor)), - capture_analyzer(std::move(capture_analyzer)) {} + capture_analyzer(std::move(capture_analyzer)), + neural_residual_echo_estimator( + std::move(neural_residual_echo_estimator)) {} // Accessed internally from capture or during initialization. const scoped_refptr<EchoDetector> echo_detector; const std::unique_ptr<CustomProcessing> capture_post_processor; @@ -386,6 +393,7 @@ class AudioProcessingImpl : public AudioProcessing { std::unique_ptr<NoiseSuppressor> noise_suppressor; std::unique_ptr<PostFilter> post_filter; std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster; + std::unique_ptr<NeuralResidualEchoEstimator> neural_residual_echo_estimator; } submodules_; // State that is written to while holding both the render and capture locks diff --git a/third_party/libwebrtc/moz-patch-stack/s0027.patch b/third_party/libwebrtc/moz-patch-stack/s0027.patch @@ -816,10 +816,10 @@ index 8e8a4bff5f..e57e55882c 100644 testonly = true diff --git a/modules/audio_processing/aec3/BUILD.gn b/modules/audio_processing/aec3/BUILD.gn -index 0ebeecd30f..4d7dc5d9cb 100644 +index e864c4dfe2..c12c4b653f 100644 --- a/modules/audio_processing/aec3/BUILD.gn +++ b/modules/audio_processing/aec3/BUILD.gn -@@ -263,14 +263,11 @@ if (current_cpu == "x86" || current_cpu == "x64") { +@@ -264,14 +264,11 @@ if (current_cpu == "x86" || current_cpu == "x64") { "vector_math_avx2.cc", ] diff --git a/third_party/libwebrtc/moz-patch-stack/s0034.patch b/third_party/libwebrtc/moz-patch-stack/s0034.patch @@ -172,7 +172,7 @@ index ac862c65a8..e66ed2796e 100644 } else { sources += [ 
"spl_sqrt_floor.c" ] diff --git a/modules/audio_processing/aec3/BUILD.gn b/modules/audio_processing/aec3/BUILD.gn -index 4d7dc5d9cb..847e5ccbea 100644 +index c12c4b653f..fd9bd4f298 100644 --- a/modules/audio_processing/aec3/BUILD.gn +++ b/modules/audio_processing/aec3/BUILD.gn @@ -123,7 +123,7 @@ rtc_library("aec3") { @@ -184,7 +184,7 @@ index 4d7dc5d9cb..847e5ccbea 100644 suppressed_configs += [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ "-mfpu=neon" ] } -@@ -161,7 +161,7 @@ rtc_library("aec3") { +@@ -162,7 +162,7 @@ rtc_library("aec3") { "//third_party/abseil-cpp/absl/strings:string_view", ] @@ -193,7 +193,7 @@ index 4d7dc5d9cb..847e5ccbea 100644 deps += [ ":aec3_avx2" ] } } -@@ -252,7 +252,7 @@ rtc_source_set("fft_data") { +@@ -253,7 +253,7 @@ rtc_source_set("fft_data") { ] } diff --git a/third_party/libwebrtc/moz-patch-stack/s0102.patch b/third_party/libwebrtc/moz-patch-stack/s0102.patch @@ -455,7 +455,7 @@ index e57e55882c..80af18a5fa 100644 config("audio_device_warnings_config") { diff --git a/modules/audio_processing/aec3/BUILD.gn b/modules/audio_processing/aec3/BUILD.gn -index 847e5ccbea..2f6be208d2 100644 +index fd9bd4f298..f2348fb71c 100644 --- a/modules/audio_processing/aec3/BUILD.gn +++ b/modules/audio_processing/aec3/BUILD.gn @@ -124,7 +124,7 @@ rtc_library("aec3") { diff --git a/third_party/libwebrtc/test/fuzzers/aec3_fuzzer.cc b/third_party/libwebrtc/test/fuzzers/aec3_fuzzer.cc @@ -59,8 +59,10 @@ void FuzzOneInput(const uint8_t* data, size_t size) { 1 + fuzz_data.ReadOrDefaultValue<uint8_t>(0) % (kMaxNumChannels - 1); EchoCanceller3 aec3(CreateEnvironment(), EchoCanceller3Config(), - /*multichannel_config=*/std::nullopt, sample_rate_hz, - num_render_channels, num_capture_channels); + /*multichannel_config=*/std::nullopt, + /*neural_residual_echo_estimator=*/nullptr, + sample_rate_hz, num_render_channels, + num_capture_channels); AudioBuffer capture_audio(sample_rate_hz, num_capture_channels, sample_rate_hz, num_capture_channels,