[ tor-browser ].git.dasho

endpointer.cc (7525B)
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 //
      3 // Redistribution and use in source and binary forms, with or without
      4 // modification, are permitted provided that the following conditions are
      5 // met:
      6 //
      7 //    * Redistributions of source code must retain the above copyright
      8 // notice, this list of conditions and the following disclaimer.
      9 //    * Redistributions in binary form must reproduce the above
     10 // copyright notice, this list of conditions and the following disclaimer
     11 // in the documentation and/or other materials provided with the
     12 // distribution.
     13 //    * Neither the name of Google Inc. nor the names of its
     14 // contributors may be used to endorse or promote products derived from
     15 // this software without specific prior written permission.
     16 //
     17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29 #include "endpointer.h"
     30 
     31 #include "AudioSegment.h"
     32 
     33 namespace {
     34 const int kFrameRate = 200;  // 1 frame = 5ms of audio.
     35 }
     36 
     37 namespace mozilla {
     38 
     39 Endpointer::Endpointer(int sample_rate)
     40    : speech_input_possibly_complete_silence_length_us_(-1),
     41      speech_input_complete_silence_length_us_(-1),
     42      audio_frame_time_us_(0),
     43      sample_rate_(sample_rate),
     44      frame_size_(0) {
     45  Reset();
     46 
     47  frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
     48 
     49  speech_input_minimum_length_us_ =
     50      static_cast<int64_t>(1.7 * 1000000);
     51  speech_input_complete_silence_length_us_ =
     52      static_cast<int64_t>(0.5 * 1000000);
     53  long_speech_input_complete_silence_length_us_ = -1;
     54  long_speech_length_us_ = -1;
     55  speech_input_possibly_complete_silence_length_us_ =
     56      1 * 1000000;
     57 
     58  // Set the default configuration for Push To Talk mode.
     59  EnergyEndpointerParams ep_config;
     60  ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
     61  ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
     62  ep_config.set_endpoint_margin(0.2f);
     63  ep_config.set_onset_window(0.15f);
     64  ep_config.set_speech_on_window(0.4f);
     65  ep_config.set_offset_window(0.15f);
     66  ep_config.set_onset_detect_dur(0.09f);
     67  ep_config.set_onset_confirm_dur(0.075f);
     68  ep_config.set_on_maintain_dur(0.10f);
     69  ep_config.set_offset_confirm_dur(0.12f);
     70  ep_config.set_decision_threshold(1000.0f);
     71  ep_config.set_min_decision_threshold(50.0f);
     72  ep_config.set_fast_update_dur(0.2f);
     73  ep_config.set_sample_rate(static_cast<float>(sample_rate));
     74  ep_config.set_min_fundamental_frequency(57.143f);
     75  ep_config.set_max_fundamental_frequency(400.0f);
     76  ep_config.set_contamination_rejection_period(0.25f);
     77  energy_endpointer_.Init(ep_config);
     78 }
     79 
     80 void Endpointer::Reset() {
     81  old_ep_status_ = EP_PRE_SPEECH;
     82  waiting_for_speech_possibly_complete_timeout_ = false;
     83  waiting_for_speech_complete_timeout_ = false;
     84  speech_previously_detected_ = false;
     85  speech_input_complete_ = false;
     86  audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
     87  speech_end_time_us_ = -1;
     88  speech_start_time_us_ = -1;
     89 }
     90 
     91 void Endpointer::StartSession() {
     92  Reset();
     93  energy_endpointer_.StartSession();
     94 }
     95 
     96 void Endpointer::EndSession() {
     97  energy_endpointer_.EndSession();
     98 }
     99 
    100 void Endpointer::SetEnvironmentEstimationMode() {
    101  Reset();
    102  energy_endpointer_.SetEnvironmentEstimationMode();
    103 }
    104 
    105 void Endpointer::SetUserInputMode() {
    106  energy_endpointer_.SetUserInputMode();
    107 }
    108 
    109 EpStatus Endpointer::Status(int64_t *time) {
    110  return energy_endpointer_.Status(time);
    111 }
    112 
    113 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
    114  MOZ_ASSERT(raw_audio.mBufferFormat == AUDIO_FORMAT_S16, "Audio is not in 16 bit format");
    115  const int16_t* audio_data = static_cast<const int16_t*>(raw_audio.mChannelData[0]);
    116  const int num_samples = raw_audio.mDuration;
    117  EpStatus ep_status = EP_PRE_SPEECH;
    118 
    119  // Process the input data in blocks of frame_size_, dropping any incomplete
    120  // frames at the end (which is ok since typically the caller will be recording
    121  // audio in multiples of our frame size).
    122  int sample_index = 0;
    123  while (sample_index + frame_size_ <= num_samples) {
    124    // Have the endpointer process the frame.
    125    energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
    126                                         audio_data + sample_index,
    127                                         frame_size_,
    128                                         rms_out);
    129    sample_index += frame_size_;
    130    audio_frame_time_us_ += (frame_size_ * 1000000) /
    131                         sample_rate_;
    132 
    133    // Get the status of the endpointer.
    134    int64_t ep_time;
    135    ep_status = energy_endpointer_.Status(&ep_time);
    136    if (old_ep_status_ != ep_status)
    137        fprintf(stderr, "Status changed old= %d, new= %d\n", old_ep_status_, ep_status);
    138 
    139    // Handle state changes.
    140    if ((EP_SPEECH_PRESENT == ep_status) &&
    141        (EP_POSSIBLE_ONSET == old_ep_status_)) {
    142      speech_end_time_us_ = -1;
    143      waiting_for_speech_possibly_complete_timeout_ = false;
    144      waiting_for_speech_complete_timeout_ = false;
    145      // Trigger SpeechInputDidStart event on first detection.
    146      if (false == speech_previously_detected_) {
    147        speech_previously_detected_ = true;
    148        speech_start_time_us_ = ep_time;
    149      }
    150    }
    151    if ((EP_PRE_SPEECH == ep_status) &&
    152        (EP_POSSIBLE_OFFSET == old_ep_status_)) {
    153      speech_end_time_us_ = ep_time;
    154      waiting_for_speech_possibly_complete_timeout_ = true;
    155      waiting_for_speech_complete_timeout_ = true;
    156    }
    157    if (ep_time > speech_input_minimum_length_us_) {
    158      // Speech possibly complete timeout.
    159      if ((waiting_for_speech_possibly_complete_timeout_) &&
    160          (ep_time - speech_end_time_us_ >
    161              speech_input_possibly_complete_silence_length_us_)) {
    162        waiting_for_speech_possibly_complete_timeout_ = false;
    163      }
    164      if (waiting_for_speech_complete_timeout_) {
    165        // The length of the silence timeout period can be held constant, or it
    166        // can be changed after a fixed amount of time from the beginning of
    167        // speech.
    168        bool has_stepped_silence =
    169            (long_speech_length_us_ > 0) &&
    170            (long_speech_input_complete_silence_length_us_ > 0);
    171        int64_t requested_silence_length;
    172        if (has_stepped_silence &&
    173            (ep_time - speech_start_time_us_) > long_speech_length_us_) {
    174          requested_silence_length =
    175              long_speech_input_complete_silence_length_us_;
    176        } else {
    177          requested_silence_length =
    178              speech_input_complete_silence_length_us_;
    179        }
    180 
    181        // Speech complete timeout.
    182        if ((ep_time - speech_end_time_us_) > requested_silence_length) {
    183          waiting_for_speech_complete_timeout_ = false;
    184          speech_input_complete_ = true;
    185        }
    186      }
    187    }
    188    old_ep_status_ = ep_status;
    189  }
    190  return ep_status;
    191 }
    192 
    193 }  // namespace mozilla
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE