tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SpeechRecognition.h (10491B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef mozilla_dom_SpeechRecognition_h
      8 #define mozilla_dom_SpeechRecognition_h
      9 
     10 #include "AudioSegment.h"
     11 #include "DOMMediaStream.h"
     12 #include "MediaTrackGraph.h"
     13 #include "SpeechGrammarList.h"
     14 #include "SpeechRecognitionResultList.h"
     15 #include "endpointer.h"
     16 #include "js/TypeDecls.h"
     17 #include "mozilla/DOMEventTargetHelper.h"
     18 #include "mozilla/WeakPtr.h"
     19 #include "mozilla/dom/BindingDeclarations.h"
     20 #include "mozilla/dom/SpeechRecognitionError.h"
     21 #include "nsCOMPtr.h"
     22 #include "nsISpeechRecognitionService.h"
     23 #include "nsITimer.h"
     24 #include "nsProxyRelease.h"
     25 #include "nsString.h"
     26 #include "nsTArray.h"
     27 #include "nsWrapperCache.h"
     28 
namespace mozilla {

namespace media {
class ShutdownBlocker;
}

namespace dom {

// Observer-service topics used by the test harness to inject fake
// recognition events and to observe session end (handled in
// SpeechRecognition::ProcessTestEventRequest).
#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
 "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"

// Forward declarations; full definitions live in their own headers/cpp files.
class GlobalObject;
class AudioStreamTrack;
class SpeechEvent;
class SpeechTrackListener;

// Log module for this component; SR_LOG emits debug-level messages to it.
LogModule* GetSpeechRecognitionLog();
#define SR_LOG(...) \
 MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))
/**
 * Implementation of the Web Speech API's SpeechRecognition interface
 * (also exposed under the webkit prefix).  A recognition session is
 * driven as a state machine (FSMState) whose inputs are SpeechEvent
 * runnables (EventType): captured audio is buffered into chunks of
 * mAudioSamplesPerChunk samples, fed through mEndpointer for speech
 * detection, and handed to the configured nsISpeechRecognitionService.
 */
class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr {
 public:
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition,
                                           DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  JSObject* WrapObject(JSContext* aCx,
                       JS::Handle<JSObject*> aGivenProto) override;

  // WebIDL constructor for |new SpeechRecognition()|.
  static already_AddRefed<SpeechRecognition> Constructor(
      const GlobalObject& aGlobal, ErrorResult& aRv);

  // WebIDL constructor for the webkit-prefixed alias; same behavior as
  // the unprefixed constructor.
  static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition(
      const GlobalObject& aGlobal, ErrorResult& aRv) {
    return Constructor(aGlobal, aRv);
  }

  // --- WebIDL attribute accessors ---

  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;

  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;

  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;

  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;

  // Returns mEncodeTaskQueue, on which the service pre-processes
  // (encodes) audio samples off the main thread.
  TaskQueue* GetTaskQueueForEncoding() const;

  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;

  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  // --- WebIDL session-control methods ---

  // Starts a session.  aStream may supply the media stream explicitly;
  // otherwise one is obtained via a getUserMedia request (see
  // mStreamGeneration for how stale gUM resolutions are handled).
  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
             CallerType aCallerType, ErrorResult& aRv);

  void Stop();

  void Abort();

  // Web Speech API event handlers (on<event> IDL attributes).
  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)

  // Inputs to the recognition state machine, delivered as SpeechEvent
  // runnables and consumed by ProcessEvent()/Transition().
  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };

  void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack);

  // Adapter that forwards DOMMediaStream track-added notifications to
  // the owning SpeechRecognition.  Holds a strong reference, keeping
  // the recognition object alive while the listener is registered.
  class TrackListener final : public DOMMediaStream::TrackListener {
   public:
    NS_DECL_ISUPPORTS_INHERITED
    NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(TrackListener,
                                             DOMMediaStream::TrackListener)
    explicit TrackListener(SpeechRecognition* aSpeechRecognition)
        : mSpeechRecognition(aSpeechRecognition) {}
    void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack) override {
      mSpeechRecognition->NotifyTrackAdded(aTrack);
    }

   private:
    virtual ~TrackListener() = default;
    RefPtr<SpeechRecognition> mSpeechRecognition;
  };

  // Dispatches an error event of type aErrorType carrying aErrorCode.
  // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are
  // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16.
  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const nsACString& aMessage);
  // Convenience overload for string literals; wraps the literal without
  // an intermediate allocation.
  template <int N>
  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const char (&aMessage)[N]) {
    DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage));
  }
  // Copies samples into mAudioSamplesBuffer; returns how many were consumed.
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  // Splits a raw sample buffer into SharedBuffer chunks appended to aResult;
  // returns the number of samples processed.
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                              uint32_t aSampleCount,
                              nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  // Feeds captured audio into the session as an EVENT_AUDIO_DATA event;
  // aProvider keeps the data's producer alive until the event is handled.
  void FeedAudioData(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
                     already_AddRefed<SharedBuffer> aSamples,
                     uint32_t aDuration, MediaTrackListener* aProvider,
                     TrackRate aTrackRate);

  friend class SpeechEvent;

 private:
  virtual ~SpeechRecognition();

  // States of a recognition session, in rough lifecycle order.
  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_ABORTING,
    STATE_COUNT
  };

  void SetState(FSMState state);
  // True when mCurrentState lies in the inclusive range [begin, end].
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream);
  RefPtr<GenericNonExclusivePromise> StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  // State-machine core: ProcessEvent() validates and logs, Transition()
  // selects one of the state handlers below based on mCurrentState.
  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  // Per-state / per-transition handlers invoked by Transition().
  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  // Media stream/track currently being captured for this session.
  RefPtr<DOMMediaStream> mStream;
  RefPtr<AudioStreamTrack> mTrack;
  // True when this object owns mTrack (and must stop it itself);
  // false when the track was supplied by the caller via Start().
  bool mTrackIsOwned = false;
  RefPtr<GenericNonExclusivePromise> mStopRecordingPromise;
  RefPtr<SpeechTrackListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
  // Blocks shutdown while a session is active — TODO confirm scope in cpp.
  RefPtr<media::ShutdownBlocker> mShutdownBlocker;
  // TaskQueue responsible for pre-processing the samples by the service
  // it runs in a separate thread from the main thread
  RefPtr<TaskQueue> mEncodeTaskQueue;

  // A generation ID of the MediaStream a started session is for, so that
  // a gUM request that resolves after the session has stopped, and a new
  // one has started, can exit early. Main thread only. Can wrap.
  uint8_t mStreamGeneration = 0;

  // Current state-machine state; see FSMState.
  FSMState mCurrentState;

  // Speech/silence detector fed one chunk of samples at a time.
  Endpointer mEndpointer;
  // Number of samples consumed for the endpointer's environment
  // estimation phase (STATE_ESTIMATING).
  uint32_t mEstimationSamples;

  // Size, in samples, of each chunk buffered before being fed onward.
  uint32_t mAudioSamplesPerChunk;

  // maximum amount of time, in milliseconds (note the Ms suffix), the
  // engine will wait for voice until returning a 'no speech detected' error
  uint32_t mSpeechDetectionTimeoutMs;

  // buffer holds one chunk of mAudioSamplesPerChunk
  // samples before feeding it to mEndpointer
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  // Number of samples currently stored in mAudioSamplesBuffer.
  uint32_t mBufferedSamples;

  // Fires after mSpeechDetectionTimeoutMs of no detected speech.
  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  // BCP 47 language tag from the |lang| IDL attribute — TODO confirm
  // normalization, if any, in the cpp.
  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // private flag used to hold if the user called the setContinuous() method
  // of the API
  bool mContinuous;

  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
  //
  // 1. Default value MUST be false
  // 2. If true, interim results SHOULD be returned
  // 3. If false, interim results MUST NOT be returned
  //
  // Pocketsphinx does not return interim results; so, defaulting
  // mInterimResults to false, then ignoring its subsequent value
  // is a conforming implementation.
  bool mInterimResults;

  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
  //
  // 1. Default value is 1
  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives
  // per result"
  //
  // Pocketsphinx can only return at maximum a single
  // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting
  // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives
  // while for a 0 value returning no SpeechRecognitionAlternative per result is
  // a conforming implementation.
  uint32_t mMaxAlternatives;

  // Listener registered on mStream to learn about newly added tracks.
  RefPtr<TrackListener> mListener;

  // Handles SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC observer
  // notifications, injecting fake events for tests.
  void ProcessTestEventRequest(nsISupports* aSubject,
                               const nsAString& aEventName);

  // Human-readable names for states and events, used for SR_LOG output.
  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aEvent);
};
    288 
/**
 * Runnable carrying one SpeechRecognition::EventType — plus any payload
 * (audio segment, result list, or error) — to the main thread, where
 * Run() delivers it to the owning SpeechRecognition's state machine
 * (SpeechEvent is a friend of SpeechRecognition).
 */
class SpeechEvent : public Runnable {
 public:
  SpeechEvent(SpeechRecognition* aRecognition,
              SpeechRecognition::EventType aType);
  SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
              SpeechRecognition::EventType aType);

  ~SpeechEvent();

  NS_IMETHOD Run() override;
  // Audio payload for EVENT_AUDIO_DATA events; raw pointer —
  // presumably freed by ~SpeechEvent, confirm in the cpp.
  AudioSegment* mAudioSegment;
  RefPtr<SpeechRecognitionResultList>
      mRecognitionResultList;  // TODO: make this a session being passed which
                               // also has index and stuff
  // Error payload for the EVENT_*_ERROR event types.
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;

 private:
  // Main-thread-released handle to the recognition session this event targets.
  nsMainThreadPtrHandle<SpeechRecognition> mRecognition;

  // for AUDIO_DATA events, keep a reference to the provider
  // of the data (i.e., the SpeechTrackListener) to ensure it
  // is kept alive (and keeps SpeechRecognition alive) until this
  // event gets processed.
  RefPtr<MediaTrackListener> mProvider;
  SpeechRecognition::EventType mType;
  // Sample rate of the audio carried in mAudioSegment.
  TrackRate mTrackRate;
};
    318 
    319 }  // namespace dom
    320 
    321 inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) {
    322  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
    323 }
    324 
    325 }  // namespace mozilla
    326 
    327 #endif