tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

energy_endpointer.h (7226B)


      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 //
      3 // Redistribution and use in source and binary forms, with or without
      4 // modification, are permitted provided that the following conditions are
      5 // met:
      6 //
      7 //    * Redistributions of source code must retain the above copyright
      8 // notice, this list of conditions and the following disclaimer.
      9 //    * Redistributions in binary form must reproduce the above
     10 // copyright notice, this list of conditions and the following disclaimer
     11 // in the documentation and/or other materials provided with the
     12 // distribution.
     13 //    * Neither the name of Google Inc. nor the names of its
     14 // contributors may be used to endorse or promote products derived from
     15 // this software without specific prior written permission.
     16 //
     17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29 // The EnergyEndpointer class finds likely speech onset and offset points.
     30 //
     31 // The implementation described here is about the simplest possible.
     32 // It is based on timings of threshold crossings for overall signal
     33 // RMS. It is suitable for light weight applications.
     34 //
     35 // As written, the basic idea is that one specifies intervals that
     36 // must be occupied by super- and sub-threshold energy levels, and
     37 // defers decisions re onset and offset times until these
     38 // specifications have been met.  Three basic intervals are tested: an
     39 // onset window, a speech-on window, and an offset window.  We require
     40 // super-threshold to exceed some minimum total durations in the onset
     41 // and speech-on windows before declaring the speech onset time, and
     42 // we specify a required sub-threshold residency in the offset window
     43 // before declaring speech offset. As the various residency requirements are
     44 // met, the EnergyEndpointer instance assumes various states, and can return the
     45 // ID of these states to the client (see EpStatus below).
     46 //
     47 // The levels of the speech and background noise are continuously updated. It is
     48 // important that the background noise level be estimated initially for
     49 // robustness in noisy conditions. The first frames are assumed to be background
     50 // noise and a fast update rate is used for the noise level. The duration for
     51 // fast update is controlled by the fast_update_dur_ parameter.
     52 //
     53 // If used in noisy conditions, the endpointer should be started and run in the
     54 // EnvironmentEstimation mode, for at least 200ms, before switching to
     55 // UserInputMode.
     56 // Audio feedback contamination can appear in the input audio, if not cut
     57 // out or handled by echo cancellation. Audio feedback can trigger a false
     58 // accept. The false accepts can be ignored by setting
     59 // ep_contamination_rejection_period.
     60 
     61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
     62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_
     63 
     64 #include <vector>
     65 
     66 #include "mozilla/UniquePtr.h"
     67 
     68 #include "energy_endpointer_params.h"
     69 
     70 namespace mozilla {
     71 
     72 // Endpointer status codes
     // Status codes reported by the endpointer. The numeric values start at 10
     // and increase by one in declaration order; they are written out here so
     // the mapping is visible at a glance.
     enum EpStatus {
       EP_PRE_SPEECH = 10,
       EP_POSSIBLE_ONSET = 11,
       EP_SPEECH_PRESENT = 12,
       EP_POSSIBLE_OFFSET = 13,
       EP_POST_SPEECH = 14,
     };
     80 
     81 class EnergyEndpointer {
     82 public:
     83  // The default construction MUST be followed by Init(), before any
     84  // other use can be made of the instance.
     85  EnergyEndpointer();
     86  virtual ~EnergyEndpointer();
     87 
     88  void Init(const EnergyEndpointerParams& params);
     89 
     90  // Start the endpointer. This should be called at the beginning of a session.
     91  void StartSession();
     92 
     93  // Stop the endpointer.
     94  void EndSession();
     95 
     96  // Start environment estimation. Audio will be used for environment estimation
     97  // i.e. noise level estimation.
     98  void SetEnvironmentEstimationMode();
     99 
    100  // Start user input. This should be called when the user indicates start of
    101  // input, e.g. by pressing a button.
    102  void SetUserInputMode();
    103 
    104  // Computes the next input frame and modifies EnergyEndpointer status as
    105  // appropriate based on the computation.
    106  void ProcessAudioFrame(int64_t time_us,
    107                         const int16_t* samples, int num_samples,
    108                         float* rms_out);
    109 
    110  // Returns the current state of the EnergyEndpointer and the time
    111  // corresponding to the most recently computed frame.
    112  EpStatus Status(int64_t* status_time_us) const;
    113 
    114  bool estimating_environment() const {
    115    return estimating_environment_;
    116  }
    117 
    118  // Returns estimated noise level in dB.
    119  float GetNoiseLevelDb() const;
    120 
    121 private:
    122  class HistoryRing;
    123 
    124  // Resets the endpointer internal state.  If reset_threshold is true, the
    125  // state will be reset completely, including adaptive thresholds and the
    126  // removal of all history information.
    127  void Restart(bool reset_threshold);
    128 
    129  // Update internal speech and noise levels.
    130  void UpdateLevels(float rms);
    131 
    132  // Returns the number of frames (or frame number) corresponding to
    133  // the 'time' (in seconds).
    134  int TimeToFrame(float time) const;
    135 
    136  EpStatus status_;  // The current state of this instance.
    137  float offset_confirm_dur_sec_;  // max on time allowed to confirm POST_SPEECH
    138  int64_t endpointer_time_us_;  // Time of the most recently received audio frame.
    139  int64_t fast_update_frames_; // Number of frames for initial level adaptation.
    140  int64_t frame_counter_;  // Number of frames seen. Used for initial adaptation.
    141  float max_window_dur_;  // Largest search window size (seconds)
    142  float sample_rate_;  // Sampling rate.
    143 
    144  // Ring buffers to hold the speech activity history.
    145  UniquePtr<HistoryRing> history_;
    146 
    147  // Configuration parameters.
    148  EnergyEndpointerParams params_;
    149 
    150  // RMS which must be exceeded to conclude frame is speech.
    151  float decision_threshold_;
    152 
    153  // Flag to indicate that audio should be used to estimate environment, prior
    154  // to receiving user input.
    155  bool estimating_environment_;
    156 
    157  // Estimate of the background noise level. Used externally for UI feedback.
    158  float noise_level_;
    159 
    160  // An adaptive threshold used to update decision_threshold_ when appropriate.
    161  float rms_adapt_;
    162 
    163  // Start lag corresponds to the highest fundamental frequency.
    164  int start_lag_;
    165 
    166  // End lag corresponds to the lowest fundamental frequency.
    167  int end_lag_;
    168 
    169  // Time when mode switched from environment estimation to user input. This
    170  // is used to time forced rejection of audio feedback contamination.
    171  int64_t user_input_start_time_us_;
    172 
    173  // prevent copy constructor and assignment
    174  EnergyEndpointer(const EnergyEndpointer&);
    175  void operator=(const EnergyEndpointer&);
    176 };
    177 
    178 }  // namespace mozilla
    179 
    180 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_