[ tor-browser ].git.dasho

energy_endpointer.cc (14011B)
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 //
      3 // Redistribution and use in source and binary forms, with or without
      4 // modification, are permitted provided that the following conditions are
      5 // met:
      6 //
      7 //    * Redistributions of source code must retain the above copyright
      8 // notice, this list of conditions and the following disclaimer.
      9 //    * Redistributions in binary form must reproduce the above
     10 // copyright notice, this list of conditions and the following disclaimer
     11 // in the documentation and/or other materials provided with the
     12 // distribution.
     13 //    * Neither the name of Google Inc. nor the names of its
     14 // contributors may be used to endorse or promote products derived from
     15 // this software without specific prior written permission.
     16 //
     17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29 #include "energy_endpointer.h"
     30 
     31 #include <math.h>
     32 
     33 namespace {
     34 
     // Returns the RMS (quadratic mean) of the input signal after removing its
     // mean, i.e. sqrt(mean(x^2) - mean(x)^2), so a constant (DC-only) frame
     // yields 0.
     //
     // |samples|      pointer to |num_samples| 16-bit samples.
     // |num_samples|  number of samples to read.
     //
     // Returns 0 for an empty frame (null |samples| or |num_samples| <= 0)
     // instead of the NaN that 0/0 would produce.
     float RMS(const int16_t* samples, int num_samples) {
       if (!samples || num_samples <= 0)
         return 0.0f;

       // Accumulate in 64-bit integers so the sums stay exact.
       int64_t sample_sum = 0;
       int64_t square_sum = 0;
       for (int i = 0; i < num_samples; ++i) {
         const int64_t sample = samples[i];
         sample_sum += sample;
         square_sum += sample * sample;
       }

       // Now convert to floating point.
       const double mean = static_cast<double>(sample_sum) / num_samples;
       const double mean_square = static_cast<double>(square_sum) / num_samples;

       // Mathematically this is never negative; clamp so that rounding cannot
       // hand sqrt() a negative argument.
       double variance = mean_square - (mean * mean);
       if (variance < 0.0)
         variance = 0.0;
       return static_cast<float>(sqrt(variance));
     }
     49 
     // Converts a duration in seconds to microseconds. 0.5 is added before the
     // truncating cast, so non-negative inputs are rounded to the nearest
     // microsecond.
     int64_t Secs2Usecs(float seconds) {
       const double micros = 1.0e6 * seconds;
       return static_cast<int64_t>(micros + 0.5);
     }
     53 
     // Converts a linear amplitude to decibels (20 * log10). Values that are not
     // greater than 1.0e-100 (including zero, negatives and NaN) map to the
     // sentinel level -2000 dB.
     float GetDecibel(float value) {
       if (!(value > 1.0e-100))
         return -2000.0;
       return 20 * log10(value);
     }
     59 
     60 }  // namespace
     61 
     62 namespace mozilla {
     63 
     64 // Stores threshold-crossing histories for making decisions about the speech
     65 // state.
     66 class EnergyEndpointer::HistoryRing {
     67 public:
     68  HistoryRing() : insertion_index_(0) {}
     69 
     70  // Resets the ring to |size| elements each with state |initial_state|
     71  void SetRing(int size, bool initial_state);
     72 
     73  // Inserts a new entry into the ring and drops the oldest entry.
     74  void Insert(int64_t time_us, bool decision);
     75 
     76  // Returns the time in microseconds of the most recently added entry.
     77  int64_t EndTime() const;
     78 
     79  // Returns the sum of all intervals during which 'decision' is true within
     80  // the time in seconds specified by 'duration'. The returned interval is
     81  // in seconds.
     82  float RingSum(float duration_sec);
     83 
     84 private:
     85  struct DecisionPoint {
     86    int64_t time_us;
     87    bool decision;
     88  };
     89 
     90  std::vector<DecisionPoint> decision_points_;
     91  int insertion_index_;  // Index at which the next item gets added/inserted.
     92 
     93  HistoryRing(const HistoryRing&);
     94  void operator=(const HistoryRing&);
     95 };
     96 
     97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) {
     98  insertion_index_ = 0;
     99  decision_points_.clear();
    100  DecisionPoint init = { -1, initial_state };
    101  decision_points_.resize(size, init);
    102 }
    103 
    104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) {
    105  decision_points_[insertion_index_].time_us = time_us;
    106  decision_points_[insertion_index_].decision = decision;
    107  insertion_index_ = (insertion_index_ + 1) % decision_points_.size();
    108 }
    109 
    110 int64_t EnergyEndpointer::HistoryRing::EndTime() const {
    111  int ind = insertion_index_ - 1;
    112  if (ind < 0)
    113    ind = decision_points_.size() - 1;
    114  return decision_points_[ind].time_us;
    115 }
    116 
    117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) {
    118  if (decision_points_.empty())
    119    return 0.0;
    120 
    121  int64_t sum_us = 0;
    122  int ind = insertion_index_ - 1;
    123  if (ind < 0)
    124    ind = decision_points_.size() - 1;
    125  int64_t end_us = decision_points_[ind].time_us;
    126  bool is_on = decision_points_[ind].decision;
    127  int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec));
    128  if (start_us < 0)
    129    start_us = 0;
    130  size_t n_summed = 1;  // n points ==> (n-1) intervals
    131  while ((decision_points_[ind].time_us > start_us) &&
    132         (n_summed < decision_points_.size())) {
    133    --ind;
    134    if (ind < 0)
    135      ind = decision_points_.size() - 1;
    136    if (is_on)
    137      sum_us += end_us - decision_points_[ind].time_us;
    138    is_on = decision_points_[ind].decision;
    139    end_us = decision_points_[ind].time_us;
    140    n_summed++;
    141  }
    142 
    143  return 1.0e-6f * sum_us;  //  Returns total time that was super threshold.
    144 }
    145 
    // Constructs an endpointer in the EP_PRE_SPEECH state with all counters,
    // levels and thresholds zeroed and an empty HistoryRing. Init() later
    // copies the real parameters in, recomputes max_window_dur_ from them and,
    // through Restart(), sizes the history ring.
    // NOTE(review): the initializer order presumably mirrors the member
    // declaration order in the header - confirm before reordering.
    146 EnergyEndpointer::EnergyEndpointer()
    147    : status_(EP_PRE_SPEECH),
    148      offset_confirm_dur_sec_(0),
    149      endpointer_time_us_(0),
    150      fast_update_frames_(0),
    151      frame_counter_(0),
    // Placeholder window length; Init() overwrites it with the longest of the
    // configured onset / speech-on / offset windows.
    152      max_window_dur_(4.0),
    153      sample_rate_(0),
    // The ring holds no entries until Restart() calls SetRing() on it.
    154      history_(new HistoryRing()),
    155      decision_threshold_(0),
    156      estimating_environment_(false),
    157      noise_level_(0),
    158      rms_adapt_(0),
    159      start_lag_(0),
    160      end_lag_(0),
    161      user_input_start_time_us_(0) {
    162 }
    163 
    164 EnergyEndpointer::~EnergyEndpointer() {
    165 }
    166 
    167 int EnergyEndpointer::TimeToFrame(float time) const {
    168  return static_cast<int32_t>(0.5 + (time / params_.frame_period()));
    169 }
    170 
    171 void EnergyEndpointer::Restart(bool reset_threshold) {
    172  status_ = EP_PRE_SPEECH;
    173  user_input_start_time_us_ = 0;
    174 
    175  if (reset_threshold) {
    176    decision_threshold_ = params_.decision_threshold();
    177    rms_adapt_ = decision_threshold_;
    178    noise_level_ = params_.decision_threshold() / 2.0f;
    179    frame_counter_ = 0;  // Used for rapid initial update of levels.
    180  }
    181 
    182  // Set up the memories to hold the history windows.
    183  history_->SetRing(TimeToFrame(max_window_dur_), false);
    184 
    185  // Flag that indicates that current input should be used for
    186  // estimating the environment. The user has not yet started input
    187  // by e.g. pressed the push-to-talk button. By default, this is
    188  // false for backward compatibility.
    189  estimating_environment_ = false;
    190 }
    191 
    192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) {
    193  params_ = params;
    194 
    195  // Find the longest history interval to be used, and make the ring
    196  // large enough to accommodate that number of frames.  NOTE: This
    197  // depends upon ep_frame_period being set correctly in the factory
    198  // that did this instantiation.
    199  max_window_dur_ = params_.onset_window();
    200  if (params_.speech_on_window() > max_window_dur_)
    201    max_window_dur_ = params_.speech_on_window();
    202  if (params_.offset_window() > max_window_dur_)
    203    max_window_dur_ = params_.offset_window();
    204  Restart(true);
    205 
    206  offset_confirm_dur_sec_ = params_.offset_window() -
    207                            params_.offset_confirm_dur();
    208  if (offset_confirm_dur_sec_ < 0.0)
    209    offset_confirm_dur_sec_ = 0.0;
    210 
    211  user_input_start_time_us_ = 0;
    212 
    213  // Flag that indicates that  current input should be used for
    214  // estimating the environment. The user has not yet started input
    215  // by e.g. pressed the push-to-talk button. By default, this is
    216  // false for backward compatibility.
    217  estimating_environment_ = false;
    218  // The initial value of the noise and speech levels is inconsequential.
    219  // The level of the first frame will overwrite these values.
    220  noise_level_ = params_.decision_threshold() / 2.0f;
    221  fast_update_frames_ =
    222      static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
    223 
    224  frame_counter_ = 0;  // Used for rapid initial update of levels.
    225 
    226  sample_rate_ = params_.sample_rate();
    227  start_lag_ = static_cast<int>(sample_rate_ /
    228                                params_.max_fundamental_frequency());
    229  end_lag_ = static_cast<int>(sample_rate_ /
    230                              params_.min_fundamental_frequency());
    231 }
    232 
    233 void EnergyEndpointer::StartSession() {
    234  Restart(true);
    235 }
    236 
    237 void EnergyEndpointer::EndSession() {
    238  status_ = EP_POST_SPEECH;
    239 }
    240 
    241 void EnergyEndpointer::SetEnvironmentEstimationMode() {
    242  Restart(true);
    243  estimating_environment_ = true;
    244 }
    245 
    246 void EnergyEndpointer::SetUserInputMode() {
    247  estimating_environment_ = false;
    248  user_input_start_time_us_ = endpointer_time_us_;
    249 }
    250 
    251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us,
    252                                         const int16_t* samples,
    253                                         int num_samples,
    254                                         float* rms_out) {
    255  endpointer_time_us_ = time_us;
    256  float rms = RMS(samples, num_samples);
    257 
    258  // Check that this is user input audio vs. pre-input adaptation audio.
    259  // Input audio starts when the user indicates start of input, by e.g.
    260  // pressing push-to-talk. Audio recieved prior to that is used to update
    261  // noise and speech level estimates.
    262  if (!estimating_environment_) {
    263    bool decision = false;
    264    if ((endpointer_time_us_ - user_input_start_time_us_) <
    265        Secs2Usecs(params_.contamination_rejection_period())) {
    266      decision = false;
    267      //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
    268    } else {
    269      decision = (rms > decision_threshold_);
    270    }
    271 
    272    history_->Insert(endpointer_time_us_, decision);
    273 
    274    switch (status_) {
    275      case EP_PRE_SPEECH:
    276        if (history_->RingSum(params_.onset_window()) >
    277            params_.onset_detect_dur()) {
    278          status_ = EP_POSSIBLE_ONSET;
    279        }
    280        break;
    281 
    282      case EP_POSSIBLE_ONSET: {
    283        float tsum = history_->RingSum(params_.onset_window());
    284        if (tsum > params_.onset_confirm_dur()) {
    285          status_ = EP_SPEECH_PRESENT;
    286        } else {  // If signal is not maintained, drop back to pre-speech.
    287          if (tsum <= params_.onset_detect_dur())
    288            status_ = EP_PRE_SPEECH;
    289        }
    290        break;
    291      }
    292 
    293      case EP_SPEECH_PRESENT: {
    294        // To induce hysteresis in the state residency, we allow a
    295        // smaller residency time in the on_ring, than was required to
    296        // enter the SPEECH_PERSENT state.
    297        float on_time = history_->RingSum(params_.speech_on_window());
    298        if (on_time < params_.on_maintain_dur())
    299          status_ = EP_POSSIBLE_OFFSET;
    300        break;
    301      }
    302 
    303      case EP_POSSIBLE_OFFSET:
    304        if (history_->RingSum(params_.offset_window()) <=
    305            offset_confirm_dur_sec_) {
    306          // Note that this offset time may be beyond the end
    307          // of the input buffer in a real-time system.  It will be up
    308          // to the RecognizerSession to decide what to do.
    309          status_ = EP_PRE_SPEECH;  // Automatically reset for next utterance.
    310        } else {  // If speech picks up again we allow return to SPEECH_PRESENT.
    311          if (history_->RingSum(params_.speech_on_window()) >=
    312              params_.on_maintain_dur())
    313            status_ = EP_SPEECH_PRESENT;
    314        }
    315        break;
    316 
    317      default:
    318        break;
    319    }
    320 
    321    // If this is a quiet, non-speech region, slowly adapt the detection
    322    // threshold to be about 6dB above the average RMS.
    323    if ((!decision) && (status_ == EP_PRE_SPEECH)) {
    324      decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
    325      rms_adapt_ = decision_threshold_;
    326    } else {
    327      // If this is in a speech region, adapt the decision threshold to
    328      // be about 10dB below the average RMS. If the noise level is high,
    329      // the threshold is pushed up.
    330      // Adaptation up to a higher level is 5 times faster than decay to
    331      // a lower level.
    332      if ((status_ == EP_SPEECH_PRESENT) && decision) {
    333        if (rms_adapt_ > rms) {
    334          rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
    335        } else {
    336          rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
    337        }
    338        float target_threshold = 0.3f * rms_adapt_ +  noise_level_;
    339        decision_threshold_ = (.90f * decision_threshold_) +
    340                              (0.10f * target_threshold);
    341      }
    342    }
    343 
    344    // Set a floor
    345    if (decision_threshold_ < params_.min_decision_threshold())
    346      decision_threshold_ = params_.min_decision_threshold();
    347  }
    348 
    349  // Update speech and noise levels.
    350  UpdateLevels(rms);
    351  ++frame_counter_;
    352 
    353  if (rms_out)
    354    *rms_out = GetDecibel(rms);
    355 }
    356 
    357 float EnergyEndpointer::GetNoiseLevelDb() const {
    358  return GetDecibel(noise_level_);
    359 }
    360 
    361 void EnergyEndpointer::UpdateLevels(float rms) {
    362  // Update quickly initially. We assume this is noise and that
    363  // speech is 6dB above the noise.
    364  if (frame_counter_ < fast_update_frames_) {
    365    // Alpha increases from 0 to (k-1)/k where k is the number of time
    366    // steps in the initial adaptation period.
    367    float alpha = static_cast<float>(frame_counter_) /
    368        static_cast<float>(fast_update_frames_);
    369    noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms);
    370    //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
    371  } else {
    372    // Update Noise level. The noise level adapts quickly downward, but
    373    // slowly upward. The noise_level_ parameter is not currently used
    374    // for threshold adaptation. It is used for UI feedback.
    375    if (noise_level_ < rms)
    376      noise_level_ = (0.999f * noise_level_) + (0.001f * rms);
    377    else
    378      noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
    379  }
    380  if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
    381    decision_threshold_ = noise_level_ * 2; // 6dB above noise level.
    382    // Set a floor
    383    if (decision_threshold_ < params_.min_decision_threshold())
    384      decision_threshold_ = params_.min_decision_threshold();
    385  }
    386 }
    387 
    388 EpStatus EnergyEndpointer::Status(int64_t* status_time)  const {
    389  *status_time = history_->EndTime();
    390  return status_;
    391 }
    392 
    393 }  // namespace mozilla
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE