tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

speech_level_estimator.cc (7074B)


      1 /*
      2 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
      3 *
      4 *  Use of this source code is governed by a BSD-style license
      5 *  that can be found in the LICENSE file in the root of the source
      6 *  tree. An additional intellectual property rights grant can be found
      7 *  in the file PATENTS.  All contributing project authors may
      8 *  be found in the AUTHORS file in the root of the source tree.
      9 */
     10 
     11 #include "modules/audio_processing/agc2/speech_level_estimator.h"
     12 
     13 #include "api/audio/audio_processing.h"
     14 #include "modules/audio_processing/agc2/agc2_common.h"
     15 #include "modules/audio_processing/logging/apm_data_dumper.h"
     16 #include "rtc_base/checks.h"
     17 #include "rtc_base/numerics/safe_minmax.h"
     18 
     19 namespace webrtc {
     20 namespace {
     21 
     22 float ClampLevelEstimateDbfs(float level_estimate_dbfs) {
     23  return SafeClamp<float>(level_estimate_dbfs, -90.0f, 30.0f);
     24 }
     25 
     26 // Returns the initial speech level estimate needed to apply the initial gain.
     27 float GetInitialSpeechLevelEstimateDbfs(
     28    const AudioProcessing::Config::GainController2::AdaptiveDigital& config) {
     29  return ClampLevelEstimateDbfs(-kSaturationProtectorInitialHeadroomDb -
     30                                config.initial_gain_db - config.headroom_db);
     31 }
     32 
     33 }  // namespace
     34 
     35 bool SpeechLevelEstimator::LevelEstimatorState::operator==(
     36    const SpeechLevelEstimator::LevelEstimatorState& b) const {
     37  return time_to_confidence_ms == b.time_to_confidence_ms &&
     38         level_dbfs.numerator == b.level_dbfs.numerator &&
     39         level_dbfs.denominator == b.level_dbfs.denominator;
     40 }
     41 
     42 float SpeechLevelEstimator::LevelEstimatorState::Ratio::GetRatio() const {
     43  RTC_DCHECK_NE(denominator, 0.f);
     44  return numerator / denominator;
     45 }
     46 
     47 SpeechLevelEstimator::SpeechLevelEstimator(
     48    ApmDataDumper* apm_data_dumper,
     49    const AudioProcessing::Config::GainController2::AdaptiveDigital& config,
     50    int adjacent_speech_frames_threshold)
     51    : apm_data_dumper_(apm_data_dumper),
     52      initial_speech_level_dbfs_(GetInitialSpeechLevelEstimateDbfs(config)),
     53      adjacent_speech_frames_threshold_(adjacent_speech_frames_threshold),
     54      level_dbfs_(initial_speech_level_dbfs_),
     55      // TODO(bugs.webrtc.org/7494): Remove init below when AGC2 input volume
     56      // controller temporal dependency removed.
     57      is_confident_(false) {
     58  RTC_DCHECK(apm_data_dumper_);
     59  RTC_DCHECK_GE(adjacent_speech_frames_threshold_, 1);
     60  Reset();
     61 }
     62 
     63 void SpeechLevelEstimator::Update(float rms_dbfs,
     64                                  float peak_dbfs,
     65                                  float speech_probability) {
     66  RTC_DCHECK_GT(rms_dbfs, -150.0f);
     67  RTC_DCHECK_LT(rms_dbfs, 50.0f);
     68  RTC_DCHECK_GT(peak_dbfs, -150.0f);
     69  RTC_DCHECK_LT(peak_dbfs, 50.0f);
     70  RTC_DCHECK_GE(speech_probability, 0.0f);
     71  RTC_DCHECK_LE(speech_probability, 1.0f);
     72  if (speech_probability < kVadConfidenceThreshold) {
     73    // Not a speech frame.
     74    if (adjacent_speech_frames_threshold_ > 1) {
     75      // When two or more adjacent speech frames are required in order to update
     76      // the state, we need to decide whether to discard or confirm the updates
     77      // based on the speech sequence length.
     78      if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
     79        // First non-speech frame after a long enough sequence of speech frames.
     80        // Update the reliable state.
     81        reliable_state_ = preliminary_state_;
     82      } else if (num_adjacent_speech_frames_ > 0) {
     83        // First non-speech frame after a too short sequence of speech frames.
     84        // Reset to the last reliable state.
     85        preliminary_state_ = reliable_state_;
     86      }
     87    }
     88    num_adjacent_speech_frames_ = 0;
     89  } else {
     90    // Speech frame observed.
     91    num_adjacent_speech_frames_++;
     92 
     93    // Update preliminary level estimate.
     94    RTC_DCHECK_GE(preliminary_state_.time_to_confidence_ms, 0);
     95    const bool buffer_is_full = preliminary_state_.time_to_confidence_ms == 0;
     96    if (!buffer_is_full) {
     97      preliminary_state_.time_to_confidence_ms -= kFrameDurationMs;
     98    }
     99    // Weighted average of levels with speech probability as weight.
    100    RTC_DCHECK_GT(speech_probability, 0.0f);
    101    const float leak_factor = buffer_is_full ? kLevelEstimatorLeakFactor : 1.0f;
    102    preliminary_state_.level_dbfs.numerator =
    103        preliminary_state_.level_dbfs.numerator * leak_factor +
    104        rms_dbfs * speech_probability;
    105    preliminary_state_.level_dbfs.denominator =
    106        preliminary_state_.level_dbfs.denominator * leak_factor +
    107        speech_probability;
    108 
    109    const float level_dbfs = preliminary_state_.level_dbfs.GetRatio();
    110 
    111    if (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_) {
    112      // `preliminary_state_` is now reliable. Update the last level estimation.
    113      level_dbfs_ = ClampLevelEstimateDbfs(level_dbfs);
    114    }
    115  }
    116  UpdateIsConfident();
    117  DumpDebugData();
    118 }
    119 
    120 void SpeechLevelEstimator::UpdateIsConfident() {
    121  if (adjacent_speech_frames_threshold_ == 1) {
    122    // Ignore `reliable_state_` when a single frame is enough to update the
    123    // level estimate (because it is not used).
    124    is_confident_ = preliminary_state_.time_to_confidence_ms == 0;
    125    return;
    126  }
    127  // Once confident, it remains confident.
    128  RTC_DCHECK(reliable_state_.time_to_confidence_ms != 0 ||
    129             preliminary_state_.time_to_confidence_ms == 0);
    130  // During the first long enough speech sequence, `reliable_state_` must be
    131  // ignored since `preliminary_state_` is used.
    132  is_confident_ =
    133      reliable_state_.time_to_confidence_ms == 0 ||
    134      (num_adjacent_speech_frames_ >= adjacent_speech_frames_threshold_ &&
    135       preliminary_state_.time_to_confidence_ms == 0);
    136 }
    137 
    138 void SpeechLevelEstimator::Reset() {
    139  ResetLevelEstimatorState(preliminary_state_);
    140  ResetLevelEstimatorState(reliable_state_);
    141  level_dbfs_ = initial_speech_level_dbfs_;
    142  num_adjacent_speech_frames_ = 0;
    143 }
    144 
    145 void SpeechLevelEstimator::ResetLevelEstimatorState(
    146    LevelEstimatorState& state) const {
    147  state.time_to_confidence_ms = kLevelEstimatorTimeToConfidenceMs;
    148  state.level_dbfs.numerator = initial_speech_level_dbfs_;
    149  state.level_dbfs.denominator = 1.0f;
    150 }
    151 
    152 void SpeechLevelEstimator::DumpDebugData() const {
    153  if (!apm_data_dumper_)
    154    return;
    155  apm_data_dumper_->DumpRaw("agc2_speech_level_dbfs", level_dbfs_);
    156  apm_data_dumper_->DumpRaw("agc2_speech_level_is_confident", is_confident_);
    157  apm_data_dumper_->DumpRaw(
    158      "agc2_adaptive_level_estimator_num_adjacent_speech_frames",
    159      num_adjacent_speech_frames_);
    160  apm_data_dumper_->DumpRaw(
    161      "agc2_adaptive_level_estimator_preliminary_level_estimate_num",
    162      preliminary_state_.level_dbfs.numerator);
    163  apm_data_dumper_->DumpRaw(
    164      "agc2_adaptive_level_estimator_preliminary_level_estimate_den",
    165      preliminary_state_.level_dbfs.denominator);
    166  apm_data_dumper_->DumpRaw(
    167      "agc2_adaptive_level_estimator_preliminary_time_to_confidence_ms",
    168      preliminary_state_.time_to_confidence_ms);
    169  apm_data_dumper_->DumpRaw(
    170      "agc2_adaptive_level_estimator_reliable_time_to_confidence_ms",
    171      reliable_state_.time_to_confidence_ms);
    172 }
    173 
    174 }  // namespace webrtc