endpointer.h (7085B)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // 3 // Redistribution and use in source and binary forms, with or without 4 // modification, are permitted provided that the following conditions are 5 // met: 6 // 7 // * Redistributions of source code must retain the above copyright 8 // notice, this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above 10 // copyright notice, this list of conditions and the following disclaimer 11 // in the documentation and/or other materials provided with the 12 // distribution. 13 // * Neither the name of Google Inc. nor the names of its 14 // contributors may be used to endorse or promote products derived from 15 // this software without specific prior written permission. 16 // 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ 30 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_ 31 32 #include "energy_endpointer.h" 33 34 namespace mozilla { 35 36 struct AudioChunk; 37 38 // A simple interface to the underlying energy-endpointer implementation, this 39 // class lets callers provide audio as being recorded and let them poll to find 40 // when the user has stopped speaking. 41 // 42 // There are two events that may trigger the end of speech: 43 // 44 // speechInputPossiblyComplete event: 45 // 46 // Signals that silence/noise has been detected for a *short* amount of 47 // time after some speech has been detected. It can be used for low latency 48 // UI feedback. To disable it, set it to a large amount. 49 // 50 // speechInputComplete event: 51 // 52 // This event is intended to signal end of input and to stop recording. 53 // The amount of time to wait after speech is set by 54 // speech_input_complete_silence_length_ and optionally two other 55 // parameters (see below). 56 // This time can be held constant, or can change as more speech is detected. 57 // In the latter case, the time changes after a set amount of time from the 58 // *beginning* of speech. This is motivated by the expectation that there 59 // will be two distinct types of inputs: short search queries and longer 60 // dictation style input. 61 // 62 // Three parameters are used to define the piecewise constant timeout function. 63 // The timeout length is speech_input_complete_silence_length until 64 // long_speech_length, when it changes to 65 // long_speech_input_complete_silence_length. 66 class Endpointer { 67 public: 68 explicit Endpointer(int sample_rate); 69 70 // Start the endpointer. This should be called at the beginning of a session. 71 void StartSession(); 72 73 // Stop the endpointer. 74 void EndSession(); 75 76 // Start environment estimation. Audio will be used for environment estimation 77 // i.e. noise level estimation. 78 void SetEnvironmentEstimationMode(); 79 80 // Start user input. This should be called when the user indicates start of 81 // input, e.g. by pressing a button. 82 void SetUserInputMode(); 83 84 // Process a segment of audio, which may be more than one frame. 85 // The status of the last frame will be returned. 86 EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out); 87 88 // Get the status of the endpointer. 89 EpStatus Status(int64_t *time_us); 90 91 // Get the expected frame size for audio chunks. Audio chunks are expected 92 // to contain a number of samples that is a multiple of this number, and extra 93 // samples will be dropped. 94 int32_t FrameSize() const { 95 return frame_size_; 96 } 97 98 // Returns true if the endpointer detected reasonable audio levels above 99 // background noise which could be user speech, false if not. 100 bool DidStartReceivingSpeech() const { 101 return speech_previously_detected_; 102 } 103 104 bool IsEstimatingEnvironment() const { 105 return energy_endpointer_.estimating_environment(); 106 } 107 108 void set_speech_input_complete_silence_length(int64_t time_us) { 109 speech_input_complete_silence_length_us_ = time_us; 110 } 111 112 void set_long_speech_input_complete_silence_length(int64_t time_us) { 113 long_speech_input_complete_silence_length_us_ = time_us; 114 } 115 116 void set_speech_input_possibly_complete_silence_length(int64_t time_us) { 117 speech_input_possibly_complete_silence_length_us_ = time_us; 118 } 119 120 void set_long_speech_length(int64_t time_us) { 121 long_speech_length_us_ = time_us; 122 } 123 124 bool speech_input_complete() const { 125 return speech_input_complete_; 126 } 127 128 // RMS background noise level in dB. 129 float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); } 130 131 private: 132 // Reset internal states. Helper method common to initial input utterance 133 // and following input utternaces. 134 void Reset(); 135 136 // Minimum allowable length of speech input. 137 int64_t speech_input_minimum_length_us_; 138 139 // The speechInputPossiblyComplete event signals that silence/noise has been 140 // detected for a *short* amount of time after some speech has been detected. 141 // This proporty specifies the time period. 142 int64_t speech_input_possibly_complete_silence_length_us_; 143 144 // The speechInputComplete event signals that silence/noise has been 145 // detected for a *long* amount of time after some speech has been detected. 146 // This property specifies the time period. 147 int64_t speech_input_complete_silence_length_us_; 148 149 // Same as above, this specifies the required silence period after speech 150 // detection. This period is used instead of 151 // speech_input_complete_silence_length_ when the utterance is longer than 152 // long_speech_length_. This parameter is optional. 153 int64_t long_speech_input_complete_silence_length_us_; 154 155 // The period of time after which the endpointer should consider 156 // long_speech_input_complete_silence_length_ as a valid silence period 157 // instead of speech_input_complete_silence_length_. This parameter is 158 // optional. 159 int64_t long_speech_length_us_; 160 161 // First speech onset time, used in determination of speech complete timeout. 162 int64_t speech_start_time_us_; 163 164 // Most recent end time, used in determination of speech complete timeout. 165 int64_t speech_end_time_us_; 166 167 int64_t audio_frame_time_us_; 168 EpStatus old_ep_status_; 169 bool waiting_for_speech_possibly_complete_timeout_; 170 bool waiting_for_speech_complete_timeout_; 171 bool speech_previously_detected_; 172 bool speech_input_complete_; 173 EnergyEndpointer energy_endpointer_; 174 int sample_rate_; 175 int32_t frame_size_; 176 }; 177 178 } // namespace mozilla 179 180 #endif // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_