SpeechRecognition.h (10491B)
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef mozilla_dom_SpeechRecognition_h
#define mozilla_dom_SpeechRecognition_h

#include "AudioSegment.h"
#include "DOMMediaStream.h"
#include "MediaTrackGraph.h"
#include "SpeechGrammarList.h"
#include "SpeechRecognitionResultList.h"
#include "endpointer.h"
#include "js/TypeDecls.h"
#include "mozilla/DOMEventTargetHelper.h"
#include "mozilla/WeakPtr.h"
#include "mozilla/dom/BindingDeclarations.h"
#include "mozilla/dom/SpeechRecognitionError.h"
#include "nsCOMPtr.h"
#include "nsISpeechRecognitionService.h"
#include "nsITimer.h"
#include "nsProxyRelease.h"
#include "nsString.h"
#include "nsTArray.h"
#include "nsWrapperCache.h"

namespace mozilla {

namespace media {
class ShutdownBlocker;
}

namespace dom {

// Observer-service topics used by the test harness to drive and tear down
// recognition sessions (see nsIObserver implementation).
#define SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC \
  "SpeechRecognitionTest:RequestEvent"
#define SPEECH_RECOGNITION_TEST_END_TOPIC "SpeechRecognitionTest:End"

class GlobalObject;
class AudioStreamTrack;
class SpeechEvent;
class SpeechTrackListener;

LogModule* GetSpeechRecognitionLog();
#define SR_LOG(...) \
  MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__))

// DOM-side implementation of the Web Speech API's SpeechRecognition
// interface (exposed to content via WebIDL bindings; see WrapObject and the
// IMPL_EVENT_HANDLER event set below). Drives an internal finite state
// machine (FSMState) by dispatching SpeechEvent runnables, feeds captured
// audio through mEndpointer for voice-activity detection, and forwards
// audio to an nsISpeechRecognitionService backend.
class SpeechRecognition final : public DOMEventTargetHelper,
                                public nsIObserver,
                                public SupportsWeakPtr {
 public:
  explicit SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow);

  NS_DECL_ISUPPORTS_INHERITED
  NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(SpeechRecognition,
                                           DOMEventTargetHelper)

  NS_DECL_NSIOBSERVER

  JSObject* WrapObject(JSContext* aCx,
                       JS::Handle<JSObject*> aGivenProto) override;

  static already_AddRefed<SpeechRecognition> Constructor(
      const GlobalObject& aGlobal, ErrorResult& aRv);

  // webkitSpeechRecognition is a pure alias of the unprefixed constructor.
  static already_AddRefed<SpeechRecognition> WebkitSpeechRecognition(
      const GlobalObject& aGlobal, ErrorResult& aRv) {
    return Constructor(aGlobal, aRv);
  }

  already_AddRefed<SpeechGrammarList> Grammars() const;

  void SetGrammars(mozilla::dom::SpeechGrammarList& aArg);

  void GetLang(nsString& aRetVal) const;

  void SetLang(const nsAString& aArg);

  bool GetContinuous(ErrorResult& aRv) const;

  void SetContinuous(bool aArg, ErrorResult& aRv);

  bool InterimResults() const;

  void SetInterimResults(bool aArg);

  uint32_t MaxAlternatives() const;

  // Exposes mEncodeTaskQueue so the recognition service can do sample
  // pre-processing off the main thread.
  TaskQueue* GetTaskQueueForEncoding() const;

  void SetMaxAlternatives(uint32_t aArg);

  void GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const;

  void SetServiceURI(const nsAString& aArg, ErrorResult& aRv);

  // Starts a recognition session. If aStream is provided it is used as the
  // audio source; otherwise the source is presumably acquired internally
  // (e.g. via getUserMedia — see mStreamGeneration) — confirm in the .cpp.
  void Start(const Optional<NonNull<DOMMediaStream>>& aStream,
             CallerType aCallerType, ErrorResult& aRv);

  void Stop();

  void Abort();

  // DOM event handlers defined by the Web Speech API.
  IMPL_EVENT_HANDLER(audiostart)
  IMPL_EVENT_HANDLER(soundstart)
  IMPL_EVENT_HANDLER(speechstart)
  IMPL_EVENT_HANDLER(speechend)
  IMPL_EVENT_HANDLER(soundend)
  IMPL_EVENT_HANDLER(audioend)
  IMPL_EVENT_HANDLER(result)
  IMPL_EVENT_HANDLER(nomatch)
  IMPL_EVENT_HANDLER(error)
  IMPL_EVENT_HANDLER(start)
  IMPL_EVENT_HANDLER(end)

  // Inputs that drive the recognition state machine (see FSMState and
  // Transition()). Carried by SpeechEvent runnables.
  enum EventType {
    EVENT_START,
    EVENT_STOP,
    EVENT_ABORT,
    EVENT_AUDIO_DATA,
    EVENT_AUDIO_ERROR,
    EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT,
    EVENT_RECOGNITIONSERVICE_FINAL_RESULT,
    EVENT_RECOGNITIONSERVICE_ERROR,
    EVENT_COUNT
  };

  void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack);

  // Adapter that forwards DOMMediaStream track-added notifications to the
  // owning SpeechRecognition. Holds a strong reference to it.
  class TrackListener final : public DOMMediaStream::TrackListener {
   public:
    NS_DECL_ISUPPORTS_INHERITED
    NS_DECL_CYCLE_COLLECTION_CLASS_INHERITED(TrackListener,
                                             DOMMediaStream::TrackListener)
    explicit TrackListener(SpeechRecognition* aSpeechRecognition)
        : mSpeechRecognition(aSpeechRecognition) {}
    void NotifyTrackAdded(const RefPtr<MediaStreamTrack>& aTrack) override {
      mSpeechRecognition->NotifyTrackAdded(aTrack);
    }

   private:
    virtual ~TrackListener() = default;
    RefPtr<SpeechRecognition> mSpeechRecognition;
  };

  // aMessage should be valid UTF-8, but invalid UTF-8 byte sequences are
  // replaced with the REPLACEMENT CHARACTER on conversion to UTF-16.
  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const nsACString& aMessage);
  // Convenience overload for string literals; forwards to the nsACString
  // version without an intermediate copy.
  template <int N>
  void DispatchError(EventType aErrorType,
                     SpeechRecognitionErrorCode aErrorCode,
                     const char (&aMessage)[N]) {
    DispatchError(aErrorType, aErrorCode, nsLiteralCString(aMessage));
  }
  uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
  uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer,
                              uint32_t aSampleCount,
                              nsTArray<RefPtr<SharedBuffer>>& aResult);
  AudioSegment* CreateAudioSegment(nsTArray<RefPtr<SharedBuffer>>& aChunks);
  void FeedAudioData(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
                     already_AddRefed<SharedBuffer> aSamples,
                     uint32_t aDuration, MediaTrackListener* aProvider,
                     TrackRate aTrackRate);

  friend class SpeechEvent;

 private:
  virtual ~SpeechRecognition();

  // States of the internal recognition finite state machine; transitions
  // are triggered by EventType events (see Transition()).
  enum FSMState {
    STATE_IDLE,
    STATE_STARTING,
    STATE_ESTIMATING,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FOR_RESULT,
    STATE_ABORTING,
    STATE_COUNT
  };

  void SetState(FSMState state);
  bool StateBetween(FSMState begin, FSMState end);

  bool SetRecognitionService(ErrorResult& aRv);
  bool ValidateAndSetGrammarList(ErrorResult& aRv);

  NS_IMETHOD StartRecording(RefPtr<AudioStreamTrack>& aDOMStream);
  RefPtr<GenericNonExclusivePromise> StopRecording();

  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
  void NotifyError(SpeechEvent* aEvent);

  void ProcessEvent(SpeechEvent* aEvent);
  void Transition(SpeechEvent* aEvent);

  // State-machine action handlers, invoked from Transition().
  void Reset();
  void ResetAndEnd();
  void WaitForAudioData(SpeechEvent* aEvent);
  void StartedAudioCapture(SpeechEvent* aEvent);
  void StopRecordingAndRecognize(SpeechEvent* aEvent);
  void WaitForEstimation(SpeechEvent* aEvent);
  void DetectSpeech(SpeechEvent* aEvent);
  void WaitForSpeechEnd(SpeechEvent* aEvent);
  void NotifyFinalResult(SpeechEvent* aEvent);
  void DoNothing(SpeechEvent* aEvent);
  void AbortSilently(SpeechEvent* aEvent);
  void AbortError(SpeechEvent* aEvent);

  RefPtr<DOMMediaStream> mStream;
  RefPtr<AudioStreamTrack> mTrack;
  bool mTrackIsOwned = false;
  RefPtr<GenericNonExclusivePromise> mStopRecordingPromise;
  RefPtr<SpeechTrackListener> mSpeechListener;
  nsCOMPtr<nsISpeechRecognitionService> mRecognitionService;
  RefPtr<media::ShutdownBlocker> mShutdownBlocker;
  // TaskQueue responsible for pre-processing the samples by the service;
  // it runs in a separate thread from the main thread.
  RefPtr<TaskQueue> mEncodeTaskQueue;

  // A generation ID of the MediaStream a started session is for, so that
  // a gUM request that resolves after the session has stopped, and a new
  // one has started, can exit early. Main thread only. Can wrap.
  uint8_t mStreamGeneration = 0;

  FSMState mCurrentState;

  // Voice-activity detector the buffered audio chunks are fed into.
  Endpointer mEndpointer;
  uint32_t mEstimationSamples;

  uint32_t mAudioSamplesPerChunk;

  // maximum amount of milliseconds the engine will wait for voice
  // until returning a 'no speech detected' error
  uint32_t mSpeechDetectionTimeoutMs;

  // buffer holds one chunk of mAudioSamplesPerChunk
  // samples before feeding it to mEndpointer
  RefPtr<SharedBuffer> mAudioSamplesBuffer;
  uint32_t mBufferedSamples;

  nsCOMPtr<nsITimer> mSpeechDetectionTimer;
  bool mAborted;

  nsString mLang;

  RefPtr<SpeechGrammarList> mSpeechGrammarList;

  // private flag used to hold if the user called the setContinuous() method
  // of the API
  bool mContinuous;

  // WebSpeechAPI (http://bit.ly/1gIl7DC) states:
  //
  // 1. Default value MUST be false
  // 2. If true, interim results SHOULD be returned
  // 3. If false, interim results MUST NOT be returned
  //
  // Pocketsphinx does not return interim results; so, defaulting
  // mInterimResults to false, then ignoring its subsequent value
  // is a conforming implementation.
  bool mInterimResults;

  // WebSpeechAPI (http://bit.ly/1JAiqeo) states:
  //
  // 1. Default value is 1
  // 2. Subsequent value is the "maximum number of SpeechRecognitionAlternatives
  // per result"
  //
  // Pocketsphinx can only return at maximum a single
  // SpeechRecognitionAlternative per SpeechRecognitionResult. So defaulting
  // mMaxAlternatives to 1, for all non zero values ignoring mMaxAlternatives
  // while for a 0 value returning no SpeechRecognitionAlternative per result is
  // a conforming implementation.
  uint32_t mMaxAlternatives;

  RefPtr<TrackListener> mListener;

  // Handles SPEECH_RECOGNITION_TEST_* observer notifications (test-only).
  void ProcessTestEventRequest(nsISupports* aSubject,
                               const nsAString& aEventName);

  // Human-readable names for logging via SR_LOG.
  const char* GetName(FSMState aId);
  const char* GetName(SpeechEvent* aEvent);
};

// Runnable carrying one EventType (plus any associated audio segment,
// result list, or error) to its SpeechRecognition; processed on the main
// thread via Run().
class SpeechEvent : public Runnable {
 public:
  SpeechEvent(SpeechRecognition* aRecognition,
              SpeechRecognition::EventType aType);
  SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition,
              SpeechRecognition::EventType aType);

  ~SpeechEvent();

  NS_IMETHOD Run() override;
  AudioSegment* mAudioSegment;
  RefPtr<SpeechRecognitionResultList>
      mRecognitionResultList;  // TODO: make this a session being passed which
                               // also has index and stuff
  RefPtr<SpeechRecognitionError> mError;

  friend class SpeechRecognition;

 private:
  nsMainThreadPtrHandle<SpeechRecognition> mRecognition;

  // for AUDIO_DATA events, keep a reference to the provider
  // of the data (i.e., the SpeechTrackListener) to ensure it
  // is kept alive (and keeps SpeechRecognition alive) until this
  // event gets processed.
  RefPtr<MediaTrackListener> mProvider;
  SpeechRecognition::EventType mType;
  TrackRate mTrackRate;
};

}  // namespace dom

// Disambiguates the nsISupports conversion: SpeechRecognition inherits
// nsISupports through more than one base, so cast through
// DOMEventTargetHelper explicitly.
inline nsISupports* ToSupports(dom::SpeechRecognition* aRec) {
  return ToSupports(static_cast<DOMEventTargetHelper*>(aRec));
}

}  // namespace mozilla

#endif