energy_endpointer.cc (14011B)
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved. 2 // 3 // Redistribution and use in source and binary forms, with or without 4 // modification, are permitted provided that the following conditions are 5 // met: 6 // 7 // * Redistributions of source code must retain the above copyright 8 // notice, this list of conditions and the following disclaimer. 9 // * Redistributions in binary form must reproduce the above 10 // copyright notice, this list of conditions and the following disclaimer 11 // in the documentation and/or other materials provided with the 12 // distribution. 13 // * Neither the name of Google Inc. nor the names of its 14 // contributors may be used to endorse or promote products derived from 15 // this software without specific prior written permission. 16 // 17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 29 #include "energy_endpointer.h" 30 31 #include <math.h> 32 33 namespace { 34 35 // Returns the RMS (quadratic mean) of the input signal. 36 float RMS(const int16_t* samples, int num_samples) { 37 int64_t ssq_int64_t = 0; 38 int64_t sum_int64_t = 0; 39 for (int i = 0; i < num_samples; ++i) { 40 sum_int64_t += samples[i]; 41 ssq_int64_t += samples[i] * samples[i]; 42 } 43 // now convert to floats. 44 double sum = static_cast<double>(sum_int64_t); 45 sum /= num_samples; 46 double ssq = static_cast<double>(ssq_int64_t); 47 return static_cast<float>(sqrt((ssq / num_samples) - (sum * sum))); 48 } 49 50 int64_t Secs2Usecs(float seconds) { 51 return static_cast<int64_t>(0.5 + (1.0e6 * seconds)); 52 } 53 54 float GetDecibel(float value) { 55 if (value > 1.0e-100) 56 return 20 * log10(value); 57 return -2000.0; 58 } 59 60 } // namespace 61 62 namespace mozilla { 63 64 // Stores threshold-crossing histories for making decisions about the speech 65 // state. 66 class EnergyEndpointer::HistoryRing { 67 public: 68 HistoryRing() : insertion_index_(0) {} 69 70 // Resets the ring to |size| elements each with state |initial_state| 71 void SetRing(int size, bool initial_state); 72 73 // Inserts a new entry into the ring and drops the oldest entry. 74 void Insert(int64_t time_us, bool decision); 75 76 // Returns the time in microseconds of the most recently added entry. 77 int64_t EndTime() const; 78 79 // Returns the sum of all intervals during which 'decision' is true within 80 // the time in seconds specified by 'duration'. The returned interval is 81 // in seconds. 82 float RingSum(float duration_sec); 83 84 private: 85 struct DecisionPoint { 86 int64_t time_us; 87 bool decision; 88 }; 89 90 std::vector<DecisionPoint> decision_points_; 91 int insertion_index_; // Index at which the next item gets added/inserted. 92 93 HistoryRing(const HistoryRing&); 94 void operator=(const HistoryRing&); 95 }; 96 97 void EnergyEndpointer::HistoryRing::SetRing(int size, bool initial_state) { 98 insertion_index_ = 0; 99 decision_points_.clear(); 100 DecisionPoint init = { -1, initial_state }; 101 decision_points_.resize(size, init); 102 } 103 104 void EnergyEndpointer::HistoryRing::Insert(int64_t time_us, bool decision) { 105 decision_points_[insertion_index_].time_us = time_us; 106 decision_points_[insertion_index_].decision = decision; 107 insertion_index_ = (insertion_index_ + 1) % decision_points_.size(); 108 } 109 110 int64_t EnergyEndpointer::HistoryRing::EndTime() const { 111 int ind = insertion_index_ - 1; 112 if (ind < 0) 113 ind = decision_points_.size() - 1; 114 return decision_points_[ind].time_us; 115 } 116 117 float EnergyEndpointer::HistoryRing::RingSum(float duration_sec) { 118 if (decision_points_.empty()) 119 return 0.0; 120 121 int64_t sum_us = 0; 122 int ind = insertion_index_ - 1; 123 if (ind < 0) 124 ind = decision_points_.size() - 1; 125 int64_t end_us = decision_points_[ind].time_us; 126 bool is_on = decision_points_[ind].decision; 127 int64_t start_us = end_us - static_cast<int64_t>(0.5 + (1.0e6 * duration_sec)); 128 if (start_us < 0) 129 start_us = 0; 130 size_t n_summed = 1; // n points ==> (n-1) intervals 131 while ((decision_points_[ind].time_us > start_us) && 132 (n_summed < decision_points_.size())) { 133 --ind; 134 if (ind < 0) 135 ind = decision_points_.size() - 1; 136 if (is_on) 137 sum_us += end_us - decision_points_[ind].time_us; 138 is_on = decision_points_[ind].decision; 139 end_us = decision_points_[ind].time_us; 140 n_summed++; 141 } 142 143 return 1.0e-6f * sum_us; // Returns total time that was super threshold. 144 } 145 146 EnergyEndpointer::EnergyEndpointer() 147 : status_(EP_PRE_SPEECH), 148 offset_confirm_dur_sec_(0), 149 endpointer_time_us_(0), 150 fast_update_frames_(0), 151 frame_counter_(0), 152 max_window_dur_(4.0), 153 sample_rate_(0), 154 history_(new HistoryRing()), 155 decision_threshold_(0), 156 estimating_environment_(false), 157 noise_level_(0), 158 rms_adapt_(0), 159 start_lag_(0), 160 end_lag_(0), 161 user_input_start_time_us_(0) { 162 } 163 164 EnergyEndpointer::~EnergyEndpointer() { 165 } 166 167 int EnergyEndpointer::TimeToFrame(float time) const { 168 return static_cast<int32_t>(0.5 + (time / params_.frame_period())); 169 } 170 171 void EnergyEndpointer::Restart(bool reset_threshold) { 172 status_ = EP_PRE_SPEECH; 173 user_input_start_time_us_ = 0; 174 175 if (reset_threshold) { 176 decision_threshold_ = params_.decision_threshold(); 177 rms_adapt_ = decision_threshold_; 178 noise_level_ = params_.decision_threshold() / 2.0f; 179 frame_counter_ = 0; // Used for rapid initial update of levels. 180 } 181 182 // Set up the memories to hold the history windows. 183 history_->SetRing(TimeToFrame(max_window_dur_), false); 184 185 // Flag that indicates that current input should be used for 186 // estimating the environment. The user has not yet started input 187 // by e.g. pressed the push-to-talk button. By default, this is 188 // false for backward compatibility. 189 estimating_environment_ = false; 190 } 191 192 void EnergyEndpointer::Init(const EnergyEndpointerParams& params) { 193 params_ = params; 194 195 // Find the longest history interval to be used, and make the ring 196 // large enough to accommodate that number of frames. NOTE: This 197 // depends upon ep_frame_period being set correctly in the factory 198 // that did this instantiation. 199 max_window_dur_ = params_.onset_window(); 200 if (params_.speech_on_window() > max_window_dur_) 201 max_window_dur_ = params_.speech_on_window(); 202 if (params_.offset_window() > max_window_dur_) 203 max_window_dur_ = params_.offset_window(); 204 Restart(true); 205 206 offset_confirm_dur_sec_ = params_.offset_window() - 207 params_.offset_confirm_dur(); 208 if (offset_confirm_dur_sec_ < 0.0) 209 offset_confirm_dur_sec_ = 0.0; 210 211 user_input_start_time_us_ = 0; 212 213 // Flag that indicates that current input should be used for 214 // estimating the environment. The user has not yet started input 215 // by e.g. pressed the push-to-talk button. By default, this is 216 // false for backward compatibility. 217 estimating_environment_ = false; 218 // The initial value of the noise and speech levels is inconsequential. 219 // The level of the first frame will overwrite these values. 220 noise_level_ = params_.decision_threshold() / 2.0f; 221 fast_update_frames_ = 222 static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period()); 223 224 frame_counter_ = 0; // Used for rapid initial update of levels. 225 226 sample_rate_ = params_.sample_rate(); 227 start_lag_ = static_cast<int>(sample_rate_ / 228 params_.max_fundamental_frequency()); 229 end_lag_ = static_cast<int>(sample_rate_ / 230 params_.min_fundamental_frequency()); 231 } 232 233 void EnergyEndpointer::StartSession() { 234 Restart(true); 235 } 236 237 void EnergyEndpointer::EndSession() { 238 status_ = EP_POST_SPEECH; 239 } 240 241 void EnergyEndpointer::SetEnvironmentEstimationMode() { 242 Restart(true); 243 estimating_environment_ = true; 244 } 245 246 void EnergyEndpointer::SetUserInputMode() { 247 estimating_environment_ = false; 248 user_input_start_time_us_ = endpointer_time_us_; 249 } 250 251 void EnergyEndpointer::ProcessAudioFrame(int64_t time_us, 252 const int16_t* samples, 253 int num_samples, 254 float* rms_out) { 255 endpointer_time_us_ = time_us; 256 float rms = RMS(samples, num_samples); 257 258 // Check that this is user input audio vs. pre-input adaptation audio. 259 // Input audio starts when the user indicates start of input, by e.g. 260 // pressing push-to-talk. Audio recieved prior to that is used to update 261 // noise and speech level estimates. 262 if (!estimating_environment_) { 263 bool decision = false; 264 if ((endpointer_time_us_ - user_input_start_time_us_) < 265 Secs2Usecs(params_.contamination_rejection_period())) { 266 decision = false; 267 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_)); 268 } else { 269 decision = (rms > decision_threshold_); 270 } 271 272 history_->Insert(endpointer_time_us_, decision); 273 274 switch (status_) { 275 case EP_PRE_SPEECH: 276 if (history_->RingSum(params_.onset_window()) > 277 params_.onset_detect_dur()) { 278 status_ = EP_POSSIBLE_ONSET; 279 } 280 break; 281 282 case EP_POSSIBLE_ONSET: { 283 float tsum = history_->RingSum(params_.onset_window()); 284 if (tsum > params_.onset_confirm_dur()) { 285 status_ = EP_SPEECH_PRESENT; 286 } else { // If signal is not maintained, drop back to pre-speech. 287 if (tsum <= params_.onset_detect_dur()) 288 status_ = EP_PRE_SPEECH; 289 } 290 break; 291 } 292 293 case EP_SPEECH_PRESENT: { 294 // To induce hysteresis in the state residency, we allow a 295 // smaller residency time in the on_ring, than was required to 296 // enter the SPEECH_PERSENT state. 297 float on_time = history_->RingSum(params_.speech_on_window()); 298 if (on_time < params_.on_maintain_dur()) 299 status_ = EP_POSSIBLE_OFFSET; 300 break; 301 } 302 303 case EP_POSSIBLE_OFFSET: 304 if (history_->RingSum(params_.offset_window()) <= 305 offset_confirm_dur_sec_) { 306 // Note that this offset time may be beyond the end 307 // of the input buffer in a real-time system. It will be up 308 // to the RecognizerSession to decide what to do. 309 status_ = EP_PRE_SPEECH; // Automatically reset for next utterance. 310 } else { // If speech picks up again we allow return to SPEECH_PRESENT. 311 if (history_->RingSum(params_.speech_on_window()) >= 312 params_.on_maintain_dur()) 313 status_ = EP_SPEECH_PRESENT; 314 } 315 break; 316 317 default: 318 break; 319 } 320 321 // If this is a quiet, non-speech region, slowly adapt the detection 322 // threshold to be about 6dB above the average RMS. 323 if ((!decision) && (status_ == EP_PRE_SPEECH)) { 324 decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms); 325 rms_adapt_ = decision_threshold_; 326 } else { 327 // If this is in a speech region, adapt the decision threshold to 328 // be about 10dB below the average RMS. If the noise level is high, 329 // the threshold is pushed up. 330 // Adaptation up to a higher level is 5 times faster than decay to 331 // a lower level. 332 if ((status_ == EP_SPEECH_PRESENT) && decision) { 333 if (rms_adapt_ > rms) { 334 rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms); 335 } else { 336 rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms); 337 } 338 float target_threshold = 0.3f * rms_adapt_ + noise_level_; 339 decision_threshold_ = (.90f * decision_threshold_) + 340 (0.10f * target_threshold); 341 } 342 } 343 344 // Set a floor 345 if (decision_threshold_ < params_.min_decision_threshold()) 346 decision_threshold_ = params_.min_decision_threshold(); 347 } 348 349 // Update speech and noise levels. 350 UpdateLevels(rms); 351 ++frame_counter_; 352 353 if (rms_out) 354 *rms_out = GetDecibel(rms); 355 } 356 357 float EnergyEndpointer::GetNoiseLevelDb() const { 358 return GetDecibel(noise_level_); 359 } 360 361 void EnergyEndpointer::UpdateLevels(float rms) { 362 // Update quickly initially. We assume this is noise and that 363 // speech is 6dB above the noise. 364 if (frame_counter_ < fast_update_frames_) { 365 // Alpha increases from 0 to (k-1)/k where k is the number of time 366 // steps in the initial adaptation period. 367 float alpha = static_cast<float>(frame_counter_) / 368 static_cast<float>(fast_update_frames_); 369 noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); 370 //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_)); 371 } else { 372 // Update Noise level. The noise level adapts quickly downward, but 373 // slowly upward. The noise_level_ parameter is not currently used 374 // for threshold adaptation. It is used for UI feedback. 375 if (noise_level_ < rms) 376 noise_level_ = (0.999f * noise_level_) + (0.001f * rms); 377 else 378 noise_level_ = (0.95f * noise_level_) + (0.05f * rms); 379 } 380 if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) { 381 decision_threshold_ = noise_level_ * 2; // 6dB above noise level. 382 // Set a floor 383 if (decision_threshold_ < params_.min_decision_threshold()) 384 decision_threshold_ = params_.min_decision_threshold(); 385 } 386 } 387 388 EpStatus EnergyEndpointer::Status(int64_t* status_time) const { 389 *status_time = history_->EndTime(); 390 return status_; 391 } 392 393 } // namespace mozilla