endpointer.cc (7525B)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // 3 // Redistribution and use in source and binary forms, with or without 4 // modification, are permitted provided that the following conditions are 5 // met: 6 // 7 // * Redistributions of source code must retain the above copyright 8 // notice, this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above 10 // copyright notice, this list of conditions and the following disclaimer 11 // in the documentation and/or other materials provided with the 12 // distribution. 13 // * Neither the name of Google Inc. nor the names of its 14 // contributors may be used to endorse or promote products derived from 15 // this software without specific prior written permission. 16 // 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29 #include "endpointer.h" 30 31 #include "AudioSegment.h" 32 33 namespace { 34 const int kFrameRate = 200; // 1 frame = 5ms of audio. 35 } 36 37 namespace mozilla { 38 39 Endpointer::Endpointer(int sample_rate) 40 : speech_input_possibly_complete_silence_length_us_(-1), 41 speech_input_complete_silence_length_us_(-1), 42 audio_frame_time_us_(0), 43 sample_rate_(sample_rate), 44 frame_size_(0) { 45 Reset(); 46 47 frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate)); 48 49 speech_input_minimum_length_us_ = 50 static_cast<int64_t>(1.7 * 1000000); 51 speech_input_complete_silence_length_us_ = 52 static_cast<int64_t>(0.5 * 1000000); 53 long_speech_input_complete_silence_length_us_ = -1; 54 long_speech_length_us_ = -1; 55 speech_input_possibly_complete_silence_length_us_ = 56 1 * 1000000; 57 58 // Set the default configuration for Push To Talk mode. 59 EnergyEndpointerParams ep_config; 60 ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate)); 61 ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate)); 62 ep_config.set_endpoint_margin(0.2f); 63 ep_config.set_onset_window(0.15f); 64 ep_config.set_speech_on_window(0.4f); 65 ep_config.set_offset_window(0.15f); 66 ep_config.set_onset_detect_dur(0.09f); 67 ep_config.set_onset_confirm_dur(0.075f); 68 ep_config.set_on_maintain_dur(0.10f); 69 ep_config.set_offset_confirm_dur(0.12f); 70 ep_config.set_decision_threshold(1000.0f); 71 ep_config.set_min_decision_threshold(50.0f); 72 ep_config.set_fast_update_dur(0.2f); 73 ep_config.set_sample_rate(static_cast<float>(sample_rate)); 74 ep_config.set_min_fundamental_frequency(57.143f); 75 ep_config.set_max_fundamental_frequency(400.0f); 76 ep_config.set_contamination_rejection_period(0.25f); 77 energy_endpointer_.Init(ep_config); 78 } 79 80 void Endpointer::Reset() { 81 old_ep_status_ = EP_PRE_SPEECH; 82 waiting_for_speech_possibly_complete_timeout_ = false; 83 waiting_for_speech_complete_timeout_ = false; 84 speech_previously_detected_ = false; 85 speech_input_complete_ = false; 86 audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer. 87 speech_end_time_us_ = -1; 88 speech_start_time_us_ = -1; 89 } 90 91 void Endpointer::StartSession() { 92 Reset(); 93 energy_endpointer_.StartSession(); 94 } 95 96 void Endpointer::EndSession() { 97 energy_endpointer_.EndSession(); 98 } 99 100 void Endpointer::SetEnvironmentEstimationMode() { 101 Reset(); 102 energy_endpointer_.SetEnvironmentEstimationMode(); 103 } 104 105 void Endpointer::SetUserInputMode() { 106 energy_endpointer_.SetUserInputMode(); 107 } 108 109 EpStatus Endpointer::Status(int64_t *time) { 110 return energy_endpointer_.Status(time); 111 } 112 113 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) { 114 MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format"); 115 const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]); 116 const int num_samples = raw_audio.mDuration; 117 EpStatus ep_status = EP_PRE_SPEECH; 118 119 // Process the input data in blocks of frame_size_, dropping any incomplete 120 // frames at the end (which is ok since typically the caller will be recording 121 // audio in multiples of our frame size). 122 int sample_index = 0; 123 while (sample_index + frame_size_ <= num_samples) { 124 // Have the endpointer process the frame. 125 energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_, 126 audio_data + sample_index, 127 frame_size_, 128 rms_out); 129 sample_index += frame_size_; 130 audio_frame_time_us_ += (frame_size_ * 1000000) / 131 sample_rate_; 132 133 // Get the status of the endpointer. 134 int64_t ep_time; 135 ep_status = energy_endpointer_.Status(&ep_time); 136 if (old_ep_status_ != ep_status) 137 fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status); 138 139 // Handle state changes. 140 if ((EP_SPEECH_PRESENT == ep_status) && 141 (EP_POSSIBLE_ONSET == old_ep_status_)) { 142 speech_end_time_us_ = -1; 143 waiting_for_speech_possibly_complete_timeout_ = false; 144 waiting_for_speech_complete_timeout_ = false; 145 // Trigger SpeechInputDidStart event on first detection. 146 if (false == speech_previously_detected_) { 147 speech_previously_detected_ = true; 148 speech_start_time_us_ = ep_time; 149 } 150 } 151 if ((EP_PRE_SPEECH == ep_status) && 152 (EP_POSSIBLE_OFFSET == old_ep_status_)) { 153 speech_end_time_us_ = ep_time; 154 waiting_for_speech_possibly_complete_timeout_ = true; 155 waiting_for_speech_complete_timeout_ = true; 156 } 157 if (ep_time > speech_input_minimum_length_us_) { 158 // Speech possibly complete timeout. 159 if ((waiting_for_speech_possibly_complete_timeout_) && 160 (ep_time - speech_end_time_us_ > 161 speech_input_possibly_complete_silence_length_us_)) { 162 waiting_for_speech_possibly_complete_timeout_ = false; 163 } 164 if (waiting_for_speech_complete_timeout_) { 165 // The length of the silence timeout period can be held constant, or it 166 // can be changed after a fixed amount of time from the beginning of 167 // speech. 168 bool has_stepped_silence = 169 (long_speech_length_us_ > 0) && 170 (long_speech_input_complete_silence_length_us_ > 0); 171 int64_t requested_silence_length; 172 if (has_stepped_silence && 173 (ep_time - speech_start_time_us_) > long_speech_length_us_) { 174 requested_silence_length = 175 long_speech_input_complete_silence_length_us_; 176 } else { 177 requested_silence_length = 178 speech_input_complete_silence_length_us_; 179 } 180 181 // Speech complete timeout. 182 if ((ep_time - speech_end_time_us_) > requested_silence_length) { 183 waiting_for_speech_complete_timeout_ = false; 184 speech_input_complete_ = true; 185 } 186 } 187 } 188 old_ep_status_ = ep_status; 189 } 190 return ep_status; 191 } 192 193 } // namespace mozilla