[ tor-browser ].git.dasho

endpointer.h (7085B)
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 //
      3 // Redistribution and use in source and binary forms, with or without
      4 // modification, are permitted provided that the following conditions are
      5 // met:
      6 //
      7 //    * Redistributions of source code must retain the above copyright
      8 // notice, this list of conditions and the following disclaimer.
      9 //    * Redistributions in binary form must reproduce the above
     10 // copyright notice, this list of conditions and the following disclaimer
     11 // in the documentation and/or other materials provided with the
     12 // distribution.
     13 //    * Neither the name of Google Inc. nor the names of its
     14 // contributors may be used to endorse or promote products derived from
     15 // this software without specific prior written permission.
     16 //
     17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 
     29 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
     30 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
     31 
     32 #include "energy_endpointer.h"
     33 
     34 namespace mozilla {
     35 
     36 struct AudioChunk;
     37 
     38 // A simple interface to the underlying energy-endpointer implementation, this
     39 // class lets callers provide audio as being recorded and let them poll to find
     40 // when the user has stopped speaking.
     41 //
     42 // There are two events that may trigger the end of speech:
     43 //
     44 // speechInputPossiblyComplete event:
     45 //
     46 // Signals that silence/noise has been detected for a *short* amount of
     47 // time after some speech has been detected. It can be used for low latency
     48 // UI feedback. To disable it, set it to a large amount.
     49 //
     50 // speechInputComplete event:
     51 //
     52 // This event is intended to signal end of input and to stop recording.
     53 // The amount of time to wait after speech is set by
     54 // speech_input_complete_silence_length_ and optionally two other
     55 // parameters (see below).
     56 // This time can be held constant, or can change as more speech is detected.
     57 // In the latter case, the time changes after a set amount of time from the
     58 // *beginning* of speech.  This is motivated by the expectation that there
     59 // will be two distinct types of inputs: short search queries and longer
     60 // dictation style input.
     61 //
     62 // Three parameters are used to define the piecewise constant timeout function.
     63 // The timeout length is speech_input_complete_silence_length until
     64 // long_speech_length, when it changes to
     65 // long_speech_input_complete_silence_length.
     66 class Endpointer {
     67 public:
     68  explicit Endpointer(int sample_rate);
     69 
     70  // Start the endpointer. This should be called at the beginning of a session.
     71  void StartSession();
     72 
     73  // Stop the endpointer.
     74  void EndSession();
     75 
     76  // Start environment estimation. Audio will be used for environment estimation
     77  // i.e. noise level estimation.
     78  void SetEnvironmentEstimationMode();
     79 
     80  // Start user input. This should be called when the user indicates start of
     81  // input, e.g. by pressing a button.
     82  void SetUserInputMode();
     83 
     84  // Process a segment of audio, which may be more than one frame.
     85  // The status of the last frame will be returned.
     86  EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
     87 
     88  // Get the status of the endpointer.
     89  EpStatus Status(int64_t *time_us);
     90 
     91  // Get the expected frame size for audio chunks. Audio chunks are expected
     92  // to contain a number of samples that is a multiple of this number, and extra
     93  // samples will be dropped.
     94  int32_t FrameSize() const {
     95    return frame_size_;
     96  }
     97 
     98  // Returns true if the endpointer detected reasonable audio levels above
     99  // background noise which could be user speech, false if not.
    100  bool DidStartReceivingSpeech() const {
    101    return speech_previously_detected_;
    102  }
    103 
    104  bool IsEstimatingEnvironment() const {
    105    return energy_endpointer_.estimating_environment();
    106  }
    107 
    108  void set_speech_input_complete_silence_length(int64_t time_us) {
    109    speech_input_complete_silence_length_us_ = time_us;
    110  }
    111 
    112  void set_long_speech_input_complete_silence_length(int64_t time_us) {
    113    long_speech_input_complete_silence_length_us_ = time_us;
    114  }
    115 
    116  void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
    117    speech_input_possibly_complete_silence_length_us_ = time_us;
    118  }
    119 
    120  void set_long_speech_length(int64_t time_us) {
    121    long_speech_length_us_ = time_us;
    122  }
    123 
    124  bool speech_input_complete() const {
    125    return speech_input_complete_;
    126  }
    127 
    128  // RMS background noise level in dB.
    129  float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
    130 
    131 private:
    132  // Reset internal states. Helper method common to initial input utterance
    133  // and following input utternaces.
    134  void Reset();
    135 
    136  // Minimum allowable length of speech input.
    137  int64_t speech_input_minimum_length_us_;
    138 
    139  // The speechInputPossiblyComplete event signals that silence/noise has been
    140  // detected for a *short* amount of time after some speech has been detected.
    141  // This proporty specifies the time period.
    142  int64_t speech_input_possibly_complete_silence_length_us_;
    143 
    144  // The speechInputComplete event signals that silence/noise has been
    145  // detected for a *long* amount of time after some speech has been detected.
    146  // This property specifies the time period.
    147  int64_t speech_input_complete_silence_length_us_;
    148 
    149  // Same as above, this specifies the required silence period after speech
    150  // detection. This period is used instead of
    151  // speech_input_complete_silence_length_ when the utterance is longer than
    152  // long_speech_length_. This parameter is optional.
    153  int64_t long_speech_input_complete_silence_length_us_;
    154 
    155  // The period of time after which the endpointer should consider
    156  // long_speech_input_complete_silence_length_ as a valid silence period
    157  // instead of speech_input_complete_silence_length_. This parameter is
    158  // optional.
    159  int64_t long_speech_length_us_;
    160 
    161  // First speech onset time, used in determination of speech complete timeout.
    162  int64_t speech_start_time_us_;
    163 
    164  // Most recent end time, used in determination of speech complete timeout.
    165  int64_t speech_end_time_us_;
    166 
    167  int64_t audio_frame_time_us_;
    168  EpStatus old_ep_status_;
    169  bool waiting_for_speech_possibly_complete_timeout_;
    170  bool waiting_for_speech_complete_timeout_;
    171  bool speech_previously_detected_;
    172  bool speech_input_complete_;
    173  EnergyEndpointer energy_endpointer_;
    174  int sample_rate_;
    175  int32_t frame_size_;
    176 };
    177 
    178 }  // namespace mozilla
    179 
    180 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE