energy_endpointer.h (7226B)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // 3 // Redistribution and use in source and binary forms, with or without 4 // modification, are permitted provided that the following conditions are 5 // met: 6 // 7 // * Redistributions of source code must retain the above copyright 8 // notice, this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above 10 // copyright notice, this list of conditions and the following disclaimer 11 // in the documentation and/or other materials provided with the 12 // distribution. 13 // * Neither the name of Google Inc. nor the names of its 14 // contributors may be used to endorse or promote products derived from 15 // this software without specific prior written permission. 16 // 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29 // The EnergyEndpointer class finds likely speech onset and offset points. 30 // 31 // The implementation described here is about the simplest possible. 32 // It is based on timings of threshold crossings for overall signal 33 // RMS. It is suitable for light weight applications. 
34 // 35 // As written, the basic idea is that one specifies intervals that 36 // must be occupied by super- and sub-threshold energy levels, and 37 // defers decisions re onset and offset times until these 38 // specifications have been met. Three basic intervals are tested: an 39 // onset window, a speech-on window, and an offset window. We require 40 // super-threshold to exceed some minimum total durations in the onset 41 // and speech-on windows before declaring the speech onset time, and 42 // we specify a required sub-threshold residency in the offset window 43 // before declaring speech offset. As the various residency requirements are 44 // met, the EnergyEndpointer instance assumes various states, and can return the 45 // ID of these states to the client (see EpStatus below). 46 // 47 // The levels of the speech and background noise are continuously updated. It is 48 // important that the background noise level be estimated initially for 49 // robustness in noisy conditions. The first frames are assumed to be background 50 // noise and a fast update rate is used for the noise level. The duration for 51 // fast update is controlled by the fast_update_dur_ parameter. 52 // 53 // If used in noisy conditions, the endpointer should be started and run in the 54 // EnvironmentEstimation mode, for at least 200ms, before switching to 55 // UserInputMode. 56 // Audio feedback contamination can appear in the input audio, if not cut 57 // out or handled by echo cancellation. Audio feedback can trigger a false 58 // accept. The false accepts can be ignored by setting 59 // ep_contamination_rejection_period. 
60 61 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 62 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_ 63 64 #include <vector> 65 66 #include "mozilla/UniquePtr.h" 67 68 #include "energy_endpointer_params.h" 69 70 namespace mozilla { 71 72 // Endpointer status codes 73 enum EpStatus { 74 EP_PRE_SPEECH = 10, 75 EP_POSSIBLE_ONSET, 76 EP_SPEECH_PRESENT, 77 EP_POSSIBLE_OFFSET, 78 EP_POST_SPEECH, 79 }; 80 81 class EnergyEndpointer { 82 public: 83 // The default construction MUST be followed by Init(), before any 84 // other use can be made of the instance. 85 EnergyEndpointer(); 86 virtual ~EnergyEndpointer(); 87 88 void Init(const EnergyEndpointerParams& params); 89 90 // Start the endpointer. This should be called at the beginning of a session. 91 void StartSession(); 92 93 // Stop the endpointer. 94 void EndSession(); 95 96 // Start environment estimation. Audio will be used for environment estimation 97 // i.e. noise level estimation. 98 void SetEnvironmentEstimationMode(); 99 100 // Start user input. This should be called when the user indicates start of 101 // input, e.g. by pressing a button. 102 void SetUserInputMode(); 103 104 // Computes the next input frame and modifies EnergyEndpointer status as 105 // appropriate based on the computation. 106 void ProcessAudioFrame(int64_t time_us, 107 const int16_t* samples, int num_samples, 108 float* rms_out); 109 110 // Returns the current state of the EnergyEndpointer and the time 111 // corresponding to the most recently computed frame. 112 EpStatus Status(int64_t* status_time_us) const; 113 114 bool estimating_environment() const { 115 return estimating_environment_; 116 } 117 118 // Returns estimated noise level in dB. 119 float GetNoiseLevelDb() const; 120 121 private: 122 class HistoryRing; 123 124 // Resets the endpointer internal state. 
If reset_threshold is true, the 125 // state will be reset completely, including adaptive thresholds and the 126 // removal of all history information. 127 void Restart(bool reset_threshold); 128 129 // Update internal speech and noise levels. 130 void UpdateLevels(float rms); 131 132 // Returns the number of frames (or frame number) corresponding to 133 // the 'time' (in seconds). 134 int TimeToFrame(float time) const; 135 136 EpStatus status_; // The current state of this instance. 137 float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH 138 int64_t endpointer_time_us_; // Time of the most recently received audio frame. 139 int64_t fast_update_frames_; // Number of frames for initial level adaptation. 140 int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. 141 float max_window_dur_; // Largest search window size (seconds) 142 float sample_rate_; // Sampling rate. 143 144 // Ring buffers to hold the speech activity history. 145 UniquePtr<HistoryRing> history_; 146 147 // Configuration parameters. 148 EnergyEndpointerParams params_; 149 150 // RMS which must be exceeded to conclude frame is speech. 151 float decision_threshold_; 152 153 // Flag to indicate that audio should be used to estimate environment, prior 154 // to receiving user input. 155 bool estimating_environment_; 156 157 // Estimate of the background noise level. Used externally for UI feedback. 158 float noise_level_; 159 160 // An adaptive threshold used to update decision_threshold_ when appropriate. 161 float rms_adapt_; 162 163 // Start lag corresponds to the highest fundamental frequency. 164 int start_lag_; 165 166 // End lag corresponds to the lowest fundamental frequency. 167 int end_lag_; 168 169 // Time when mode switched from environment estimation to user input. This 170 // is used to time forced rejection of audio feedback contamination. 
171 int64_t user_input_start_time_us_; 172 173 // prevent copy constructor and assignment 174 EnergyEndpointer(const EnergyEndpointer&); 175 void operator=(const EnergyEndpointer&); 176 }; 177 178 } // namespace mozilla 179 180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENERGY_ENDPOINTER_H_