SpeechRecognition.cpp (37973B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=2 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "SpeechRecognition.h" 8 9 #include <algorithm> 10 11 #include "AudioSegment.h" 12 #include "MediaEnginePrefs.h" 13 #include "SpeechTrackListener.h" 14 #include "VideoUtils.h" 15 #include "endpointer.h" 16 #include "mozilla/AbstractThread.h" 17 #include "mozilla/MediaManager.h" 18 #include "mozilla/Preferences.h" 19 #include "mozilla/ResultVariant.h" 20 #include "mozilla/Services.h" 21 #include "mozilla/StaticPrefs_media.h" 22 #include "mozilla/dom/AudioStreamTrack.h" 23 #include "mozilla/dom/BindingUtils.h" 24 #include "mozilla/dom/Document.h" 25 #include "mozilla/dom/Element.h" 26 #include "mozilla/dom/MediaStreamError.h" 27 #include "mozilla/dom/MediaStreamTrackBinding.h" 28 #include "mozilla/dom/RootedDictionary.h" 29 #include "mozilla/dom/SpeechGrammar.h" 30 #include "mozilla/dom/SpeechRecognitionBinding.h" 31 #include "mozilla/dom/SpeechRecognitionEvent.h" 32 #include "nsCOMPtr.h" 33 #include "nsComponentManagerUtils.h" 34 #include "nsContentUtils.h" 35 #include "nsCycleCollectionParticipant.h" 36 #include "nsGlobalWindowInner.h" 37 #include "nsIObserverService.h" 38 #include "nsIPermissionManager.h" 39 #include "nsIPrincipal.h" 40 #include "nsPIDOMWindow.h" 41 #include "nsQueryObject.h" 42 #include "nsServiceManagerUtils.h" 43 44 // Undo the windows.h damage 45 #if defined(XP_WIN) && defined(GetMessage) 46 # undef GetMessage 47 #endif 48 49 namespace mozilla::dom { 50 51 #define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default" 52 #define DEFAULT_RECOGNITION_SERVICE "online" 53 54 #define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length" 55 #define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH \ 56 "media.webspeech.long_silence_length" 57 #define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH \ 58 "media.webspeech.long_speech_length" 59 #define PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS \ 60 "media.webspeech.recognition.timeout" 61 62 static const uint32_t kSAMPLE_RATE = 16000; 63 64 // number of frames corresponding to 300ms of audio to send to endpointer while 65 // it's in environment estimation mode 66 // kSAMPLE_RATE frames = 1s, kESTIMATION_FRAMES frames = 300ms 67 static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000; 68 69 LogModule* GetSpeechRecognitionLog() { 70 static LazyLogModule sLog("SpeechRecognition"); 71 return sLog; 72 } 73 #define SR_LOG(...) \ 74 MOZ_LOG(GetSpeechRecognitionLog(), mozilla::LogLevel::Debug, (__VA_ARGS__)) 75 76 namespace { 77 class SpeechRecognitionShutdownBlocker : public media::ShutdownBlocker { 78 public: 79 SpeechRecognitionShutdownBlocker(SpeechRecognition* aRecognition, 80 const nsString& aName) 81 : media::ShutdownBlocker(aName), mRecognition(aRecognition) {} 82 83 NS_IMETHOD BlockShutdown(nsIAsyncShutdownClient*) override { 84 MOZ_ASSERT(NS_IsMainThread()); 85 // AbortSilently will eventually clear the blocker. 86 mRecognition->Abort(); 87 return NS_OK; 88 } 89 90 private: 91 const RefPtr<SpeechRecognition> mRecognition; 92 }; 93 94 enum class ServiceCreationError { 95 ServiceNotFound, 96 }; 97 98 Result<nsCOMPtr<nsISpeechRecognitionService>, ServiceCreationError> 99 CreateSpeechRecognitionService(nsPIDOMWindowInner* aWindow, 100 SpeechRecognition* aRecognition, 101 const nsAString& aLang) { 102 nsAutoCString speechRecognitionServiceCID; 103 104 nsAutoCString prefValue; 105 Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE, prefValue); 106 nsAutoCString speechRecognitionService; 107 108 if (!prefValue.IsEmpty()) { 109 speechRecognitionService = prefValue; 110 } else { 111 speechRecognitionService = DEFAULT_RECOGNITION_SERVICE; 112 } 113 114 if (StaticPrefs::media_webspeech_test_fake_recognition_service()) { 115 speechRecognitionServiceCID = 116 NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake"; 117 } else { 118 speechRecognitionServiceCID = 119 nsLiteralCString(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) + 120 speechRecognitionService; 121 } 122 123 nsresult rv; 124 nsCOMPtr<nsISpeechRecognitionService> recognitionService; 125 recognitionService = 126 do_CreateInstance(speechRecognitionServiceCID.get(), &rv); 127 if (!recognitionService) { 128 return Err(ServiceCreationError::ServiceNotFound); 129 } 130 131 return recognitionService; 132 } 133 } // namespace 134 135 NS_IMPL_CYCLE_COLLECTION_WEAK_PTR_INHERITED(SpeechRecognition, 136 DOMEventTargetHelper, mStream, 137 mTrack, mRecognitionService, 138 mSpeechGrammarList, mListener) 139 140 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition) 141 NS_INTERFACE_MAP_ENTRY(nsIObserver) 142 NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper) 143 144 NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper) 145 NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper) 146 147 NS_IMPL_CYCLE_COLLECTION_INHERITED(SpeechRecognition::TrackListener, 148 DOMMediaStream::TrackListener, 149 mSpeechRecognition) 150 NS_IMPL_ADDREF_INHERITED(SpeechRecognition::TrackListener, 151 DOMMediaStream::TrackListener) 152 NS_IMPL_RELEASE_INHERITED(SpeechRecognition::TrackListener, 153 DOMMediaStream::TrackListener) 154 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SpeechRecognition::TrackListener) 155 NS_INTERFACE_MAP_END_INHERITING(DOMMediaStream::TrackListener) 156 157 SpeechRecognition::SpeechRecognition(nsPIDOMWindowInner* aOwnerWindow) 158 : DOMEventTargetHelper(aOwnerWindow), 159 mEndpointer(kSAMPLE_RATE), 160 mAudioSamplesPerChunk(mEndpointer.FrameSize()), 161 mSpeechDetectionTimer(NS_NewTimer()), 162 mSpeechGrammarList(new SpeechGrammarList(GetOwnerGlobal())), 163 mContinuous(false), 164 mInterimResults(false), 165 mMaxAlternatives(1) { 166 SR_LOG("created SpeechRecognition"); 167 168 if (StaticPrefs::media_webspeech_test_enable()) { 169 nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); 170 obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false); 171 obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false); 172 } 173 174 mEndpointer.set_speech_input_complete_silence_length( 175 Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 1250000)); 176 mEndpointer.set_long_speech_input_complete_silence_length( 177 Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 2500000)); 178 mEndpointer.set_long_speech_length( 179 Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 3 * 1000000)); 180 181 mSpeechDetectionTimeoutMs = 182 Preferences::GetInt(PREFERENCE_SPEECH_DETECTION_TIMEOUT_MS, 10000); 183 184 Reset(); 185 } 186 187 SpeechRecognition::~SpeechRecognition() = default; 188 189 bool SpeechRecognition::StateBetween(FSMState begin, FSMState end) { 190 return mCurrentState >= begin && mCurrentState <= end; 191 } 192 193 void SpeechRecognition::SetState(FSMState state) { 194 mCurrentState = state; 195 SR_LOG("Transitioned to state %s", GetName(mCurrentState)); 196 } 197 198 JSObject* SpeechRecognition::WrapObject(JSContext* aCx, 199 JS::Handle<JSObject*> aGivenProto) { 200 return SpeechRecognition_Binding::Wrap(aCx, this, aGivenProto); 201 } 202 203 already_AddRefed<SpeechRecognition> SpeechRecognition::Constructor( 204 const GlobalObject& aGlobal, ErrorResult& aRv) { 205 nsCOMPtr<nsPIDOMWindowInner> win = do_QueryInterface(aGlobal.GetAsSupports()); 206 if (!win) { 207 aRv.Throw(NS_ERROR_FAILURE); 208 return nullptr; 209 } 210 211 RefPtr<SpeechRecognition> object = new SpeechRecognition(win); 212 return object.forget(); 213 } 214 215 void SpeechRecognition::ProcessEvent(SpeechEvent* aEvent) { 216 SR_LOG("Processing %s, current state is %s", GetName(aEvent), 217 GetName(mCurrentState)); 218 219 if (mAborted && aEvent->mType != EVENT_ABORT) { 220 // ignore all events while aborting 221 return; 222 } 223 224 Transition(aEvent); 225 } 226 227 void SpeechRecognition::Transition(SpeechEvent* aEvent) { 228 switch (mCurrentState) { 229 case STATE_IDLE: 230 switch (aEvent->mType) { 231 case EVENT_START: 232 // TODO: may want to time out if we wait too long 233 // for user to approve 234 WaitForAudioData(aEvent); 235 break; 236 case EVENT_STOP: 237 case EVENT_ABORT: 238 case EVENT_AUDIO_DATA: 239 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 240 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 241 DoNothing(aEvent); 242 break; 243 case EVENT_AUDIO_ERROR: 244 case EVENT_RECOGNITIONSERVICE_ERROR: 245 AbortError(aEvent); 246 break; 247 default: 248 MOZ_CRASH("Invalid event"); 249 } 250 break; 251 case STATE_STARTING: 252 switch (aEvent->mType) { 253 case EVENT_AUDIO_DATA: 254 StartedAudioCapture(aEvent); 255 break; 256 case EVENT_AUDIO_ERROR: 257 case EVENT_RECOGNITIONSERVICE_ERROR: 258 AbortError(aEvent); 259 break; 260 case EVENT_ABORT: 261 AbortSilently(aEvent); 262 break; 263 case EVENT_STOP: 264 ResetAndEnd(); 265 break; 266 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 267 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 268 DoNothing(aEvent); 269 break; 270 case EVENT_START: 271 SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); 272 MOZ_CRASH(); 273 default: 274 MOZ_CRASH("Invalid event"); 275 } 276 break; 277 case STATE_ESTIMATING: 278 switch (aEvent->mType) { 279 case EVENT_AUDIO_DATA: 280 WaitForEstimation(aEvent); 281 break; 282 case EVENT_STOP: 283 StopRecordingAndRecognize(aEvent); 284 break; 285 case EVENT_ABORT: 286 AbortSilently(aEvent); 287 break; 288 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 289 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 290 case EVENT_RECOGNITIONSERVICE_ERROR: 291 DoNothing(aEvent); 292 break; 293 case EVENT_AUDIO_ERROR: 294 AbortError(aEvent); 295 break; 296 case EVENT_START: 297 SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType); 298 MOZ_CRASH(); 299 default: 300 MOZ_CRASH("Invalid event"); 301 } 302 break; 303 case STATE_WAITING_FOR_SPEECH: 304 switch (aEvent->mType) { 305 case EVENT_AUDIO_DATA: 306 DetectSpeech(aEvent); 307 break; 308 case EVENT_STOP: 309 StopRecordingAndRecognize(aEvent); 310 break; 311 case EVENT_ABORT: 312 AbortSilently(aEvent); 313 break; 314 case EVENT_AUDIO_ERROR: 315 AbortError(aEvent); 316 break; 317 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 318 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 319 case EVENT_RECOGNITIONSERVICE_ERROR: 320 DoNothing(aEvent); 321 break; 322 case EVENT_START: 323 SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent)); 324 MOZ_CRASH(); 325 default: 326 MOZ_CRASH("Invalid event"); 327 } 328 break; 329 case STATE_RECOGNIZING: 330 switch (aEvent->mType) { 331 case EVENT_AUDIO_DATA: 332 WaitForSpeechEnd(aEvent); 333 break; 334 case EVENT_STOP: 335 StopRecordingAndRecognize(aEvent); 336 break; 337 case EVENT_AUDIO_ERROR: 338 case EVENT_RECOGNITIONSERVICE_ERROR: 339 AbortError(aEvent); 340 break; 341 case EVENT_ABORT: 342 AbortSilently(aEvent); 343 break; 344 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 345 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 346 DoNothing(aEvent); 347 break; 348 case EVENT_START: 349 SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent)); 350 MOZ_CRASH(); 351 default: 352 MOZ_CRASH("Invalid event"); 353 } 354 break; 355 case STATE_WAITING_FOR_RESULT: 356 switch (aEvent->mType) { 357 case EVENT_STOP: 358 DoNothing(aEvent); 359 break; 360 case EVENT_AUDIO_ERROR: 361 case EVENT_RECOGNITIONSERVICE_ERROR: 362 AbortError(aEvent); 363 break; 364 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 365 NotifyFinalResult(aEvent); 366 break; 367 case EVENT_AUDIO_DATA: 368 DoNothing(aEvent); 369 break; 370 case EVENT_ABORT: 371 AbortSilently(aEvent); 372 break; 373 case EVENT_START: 374 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 375 SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", 376 GetName(aEvent)); 377 MOZ_CRASH(); 378 default: 379 MOZ_CRASH("Invalid event"); 380 } 381 break; 382 case STATE_ABORTING: 383 switch (aEvent->mType) { 384 case EVENT_STOP: 385 case EVENT_ABORT: 386 case EVENT_AUDIO_DATA: 387 case EVENT_AUDIO_ERROR: 388 case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT: 389 case EVENT_RECOGNITIONSERVICE_FINAL_RESULT: 390 case EVENT_RECOGNITIONSERVICE_ERROR: 391 DoNothing(aEvent); 392 break; 393 case EVENT_START: 394 SR_LOG("STATE_ABORTING: Unhandled aEvent %s", GetName(aEvent)); 395 MOZ_CRASH(); 396 default: 397 MOZ_CRASH("Invalid event"); 398 } 399 break; 400 default: 401 MOZ_CRASH("Invalid state"); 402 } 403 } 404 405 /* 406 * Handle a segment of recorded audio data. 407 * Returns the number of samples that were processed. 408 */ 409 uint32_t SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, 410 TrackRate aTrackRate) { 411 AudioSegment::ChunkIterator iterator(*aSegment); 412 uint32_t samples = 0; 413 while (!iterator.IsEnded()) { 414 float out; 415 mEndpointer.ProcessAudio(*iterator, &out); 416 samples += iterator->GetDuration(); 417 iterator.Next(); 418 } 419 420 // we need to call the nsISpeechRecognitionService::ProcessAudioSegment 421 // in a separate thread so that any eventual encoding or pre-processing 422 // of the audio does not block the main thread 423 nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( 424 "nsISpeechRecognitionService::ProcessAudioSegment", 425 [=, service = mRecognitionService, 426 segment = std::move(*aSegment)]() mutable { 427 service->ProcessAudioSegment(&segment, aTrackRate); 428 })); 429 430 MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); 431 (void)rv; 432 return samples; 433 } 434 435 /**************************************************************************** 436 * FSM Transition functions 437 * 438 * If a transition function may cause a DOM event to be fired, 439 * it may also be re-entered, since the event handler may cause the 440 * event loop to spin and new SpeechEvents to be processed. 441 * 442 * Rules: 443 * 1) These methods should call SetState as soon as possible. 444 * 2) If these methods dispatch DOM events, or call methods that dispatch 445 * DOM events, that should be done as late as possible. 446 * 3) If anything must happen after dispatching a DOM event, make sure 447 * the state is still what the method expected it to be. 448 ****************************************************************************/ 449 450 void SpeechRecognition::Reset() { 451 SetState(STATE_IDLE); 452 453 // This breaks potential ref-cycles. 454 mRecognitionService = nullptr; 455 456 ++mStreamGeneration; 457 if (mStream) { 458 mStream->UnregisterTrackListener(mListener); 459 mStream = nullptr; 460 mListener = nullptr; 461 } 462 mTrack = nullptr; 463 mTrackIsOwned = false; 464 mStopRecordingPromise = nullptr; 465 mEncodeTaskQueue = nullptr; 466 mEstimationSamples = 0; 467 mBufferedSamples = 0; 468 mSpeechDetectionTimer->Cancel(); 469 mAborted = false; 470 } 471 472 void SpeechRecognition::ResetAndEnd() { 473 Reset(); 474 DispatchTrustedEvent(u"end"_ns); 475 } 476 477 void SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent) { 478 SetState(STATE_STARTING); 479 } 480 481 void SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent) { 482 SetState(STATE_ESTIMATING); 483 484 mEndpointer.SetEnvironmentEstimationMode(); 485 mEstimationSamples += 486 ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); 487 488 DispatchTrustedEvent(u"audiostart"_ns); 489 if (mCurrentState == STATE_ESTIMATING) { 490 DispatchTrustedEvent(u"start"_ns); 491 } 492 } 493 494 void SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent) { 495 SetState(STATE_WAITING_FOR_RESULT); 496 497 MOZ_ASSERT(mRecognitionService, "Service deleted before recording done"); 498 499 // This will run SoundEnd on the service just before StopRecording begins 500 // shutting the encode thread down. 501 mSpeechListener->mRemovedPromise->Then( 502 GetCurrentSerialEventTarget(), __func__, 503 [service = mRecognitionService] { service->SoundEnd(); }); 504 505 StopRecording(); 506 } 507 508 void SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent) { 509 SetState(STATE_ESTIMATING); 510 511 mEstimationSamples += 512 ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); 513 if (mEstimationSamples > kESTIMATION_SAMPLES) { 514 mEndpointer.SetUserInputMode(); 515 SetState(STATE_WAITING_FOR_SPEECH); 516 } 517 } 518 519 void SpeechRecognition::DetectSpeech(SpeechEvent* aEvent) { 520 SetState(STATE_WAITING_FOR_SPEECH); 521 522 ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); 523 if (mEndpointer.DidStartReceivingSpeech()) { 524 mSpeechDetectionTimer->Cancel(); 525 SetState(STATE_RECOGNIZING); 526 DispatchTrustedEvent(u"speechstart"_ns); 527 } 528 } 529 530 void SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent) { 531 SetState(STATE_RECOGNIZING); 532 533 ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate); 534 if (mEndpointer.speech_input_complete()) { 535 DispatchTrustedEvent(u"speechend"_ns); 536 537 if (mCurrentState == STATE_RECOGNIZING) { 538 // FIXME: StopRecordingAndRecognize should only be called for single 539 // shot services for continuous we should just inform the service 540 StopRecordingAndRecognize(aEvent); 541 } 542 } 543 } 544 545 void SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent) { 546 ResetAndEnd(); 547 548 RootedDictionary<SpeechRecognitionEventInit> init(RootingCx()); 549 init.mBubbles = true; 550 init.mCancelable = false; 551 // init.mResultIndex = 0; 552 init.mResults = aEvent->mRecognitionResultList; 553 init.mInterpretation = JS::NullValue(); 554 // init.mEmma = nullptr; 555 556 RefPtr<SpeechRecognitionEvent> event = 557 SpeechRecognitionEvent::Constructor(this, u"result"_ns, init); 558 event->SetTrusted(true); 559 560 DispatchEvent(*event); 561 } 562 563 void SpeechRecognition::DoNothing(SpeechEvent* aEvent) {} 564 565 void SpeechRecognition::AbortSilently(SpeechEvent* aEvent) { 566 if (mRecognitionService) { 567 if (mTrack) { 568 // This will run Abort on the service just before StopRecording begins 569 // shutting the encode thread down. 570 mSpeechListener->mRemovedPromise->Then( 571 GetCurrentSerialEventTarget(), __func__, 572 [service = mRecognitionService] { service->Abort(); }); 573 } else { 574 // Recording hasn't started yet. We can just call Abort(). 575 mRecognitionService->Abort(); 576 } 577 } 578 579 StopRecording()->Then( 580 GetCurrentSerialEventTarget(), __func__, 581 [self = RefPtr<SpeechRecognition>(this), this] { ResetAndEnd(); }); 582 583 SetState(STATE_ABORTING); 584 } 585 586 void SpeechRecognition::AbortError(SpeechEvent* aEvent) { 587 AbortSilently(aEvent); 588 NotifyError(aEvent); 589 } 590 591 void SpeechRecognition::NotifyError(SpeechEvent* aEvent) { 592 aEvent->mError->SetTrusted(true); 593 594 DispatchEvent(*aEvent->mError); 595 } 596 597 /************************************** 598 * Event triggers and other functions * 599 **************************************/ 600 NS_IMETHODIMP 601 SpeechRecognition::StartRecording(RefPtr<AudioStreamTrack>& aTrack) { 602 // hold a reference so that the underlying track doesn't get collected. 603 mTrack = aTrack; 604 MOZ_ASSERT(!mTrack->Ended()); 605 606 mSpeechListener = SpeechTrackListener::Create(this); 607 mTrack->AddListener(mSpeechListener); 608 609 nsString blockerName; 610 blockerName.AppendPrintf("SpeechRecognition %p shutdown", this); 611 mShutdownBlocker = 612 MakeAndAddRef<SpeechRecognitionShutdownBlocker>(this, blockerName); 613 media::MustGetShutdownBarrier()->AddBlocker( 614 mShutdownBlocker, NS_LITERAL_STRING_FROM_CSTRING(__FILE__), __LINE__, 615 u"SpeechRecognition shutdown"_ns); 616 617 mEndpointer.StartSession(); 618 619 return mSpeechDetectionTimer->Init(this, mSpeechDetectionTimeoutMs, 620 nsITimer::TYPE_ONE_SHOT); 621 } 622 623 RefPtr<GenericNonExclusivePromise> SpeechRecognition::StopRecording() { 624 if (!mTrack) { 625 // Recording wasn't started, or has already been stopped. 626 if (mStream) { 627 // Ensure we don't start recording because a track became available 628 // before we get reset. 629 mStream->UnregisterTrackListener(mListener); 630 mListener = nullptr; 631 } 632 return GenericNonExclusivePromise::CreateAndResolve(true, __func__); 633 } 634 635 if (mStopRecordingPromise) { 636 return mStopRecordingPromise; 637 } 638 639 mTrack->RemoveListener(mSpeechListener); 640 if (mTrackIsOwned) { 641 mTrack->Stop(); 642 } 643 644 mEndpointer.EndSession(); 645 DispatchTrustedEvent(u"audioend"_ns); 646 647 // Block shutdown until the speech track listener has been removed from the 648 // MSG, as it holds a reference to us, and we reference the world, which we 649 // don't want to leak. 650 mStopRecordingPromise = 651 mSpeechListener->mRemovedPromise 652 ->Then( 653 GetCurrentSerialEventTarget(), __func__, 654 [self = RefPtr<SpeechRecognition>(this), this] { 655 SR_LOG("Shutting down encoding thread"); 656 return mEncodeTaskQueue->BeginShutdown(); 657 }, 658 [] { 659 MOZ_CRASH("Unexpected rejection"); 660 return ShutdownPromise::CreateAndResolve(false, __func__); 661 }) 662 ->Then( 663 GetCurrentSerialEventTarget(), __func__, 664 [self = RefPtr<SpeechRecognition>(this), this] { 665 media::MustGetShutdownBarrier()->RemoveBlocker( 666 mShutdownBlocker); 667 mShutdownBlocker = nullptr; 668 669 MOZ_DIAGNOSTIC_ASSERT(mCurrentState != STATE_IDLE); 670 return GenericNonExclusivePromise::CreateAndResolve(true, 671 __func__); 672 }, 673 [] { 674 MOZ_CRASH("Unexpected rejection"); 675 return GenericNonExclusivePromise::CreateAndResolve(false, 676 __func__); 677 }); 678 return mStopRecordingPromise; 679 } 680 681 NS_IMETHODIMP 682 SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic, 683 const char16_t* aData) { 684 MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread"); 685 686 if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) && 687 StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) { 688 DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, 689 SpeechRecognitionErrorCode::No_speech, 690 "No speech detected (timeout)"); 691 } else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) { 692 nsCOMPtr<nsIObserverService> obs = services::GetObserverService(); 693 obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC); 694 obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC); 695 } else if (StaticPrefs::media_webspeech_test_fake_fsm_events() && 696 !strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) { 697 ProcessTestEventRequest(aSubject, nsDependentString(aData)); 698 } 699 700 return NS_OK; 701 } 702 703 void SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, 704 const nsAString& aEventName) { 705 if (aEventName.EqualsLiteral("EVENT_ABORT")) { 706 Abort(); 707 } else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) { 708 DispatchError( 709 SpeechRecognition::EVENT_AUDIO_ERROR, 710 SpeechRecognitionErrorCode::Audio_capture, // TODO different codes? 711 "AUDIO_ERROR test event"); 712 } else { 713 NS_ASSERTION(StaticPrefs::media_webspeech_test_fake_recognition_service(), 714 "Got request for fake recognition service event, but " 715 "media.webspeech.test.fake_recognition_service is unset"); 716 717 // let the fake recognition service handle the request 718 } 719 } 720 721 already_AddRefed<SpeechGrammarList> SpeechRecognition::Grammars() const { 722 RefPtr<SpeechGrammarList> speechGrammarList = mSpeechGrammarList; 723 return speechGrammarList.forget(); 724 } 725 726 void SpeechRecognition::SetGrammars(SpeechGrammarList& aArg) { 727 mSpeechGrammarList = &aArg; 728 } 729 730 void SpeechRecognition::GetLang(nsString& aRetVal) const { aRetVal = mLang; } 731 732 void SpeechRecognition::SetLang(const nsAString& aArg) { mLang = aArg; } 733 734 bool SpeechRecognition::GetContinuous(ErrorResult& aRv) const { 735 return mContinuous; 736 } 737 738 void SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv) { 739 mContinuous = aArg; 740 } 741 742 bool SpeechRecognition::InterimResults() const { return mInterimResults; } 743 744 void SpeechRecognition::SetInterimResults(bool aArg) { mInterimResults = aArg; } 745 746 uint32_t SpeechRecognition::MaxAlternatives() const { return mMaxAlternatives; } 747 748 void SpeechRecognition::SetMaxAlternatives(uint32_t aArg) { 749 mMaxAlternatives = aArg; 750 } 751 752 void SpeechRecognition::GetServiceURI(nsString& aRetVal, 753 ErrorResult& aRv) const { 754 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); 755 } 756 757 void SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv) { 758 aRv.Throw(NS_ERROR_NOT_IMPLEMENTED); 759 } 760 761 void SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, 762 CallerType aCallerType, ErrorResult& aRv) { 763 if (mCurrentState != STATE_IDLE) { 764 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 765 return; 766 } 767 768 if (!SetRecognitionService(aRv)) { 769 return; 770 } 771 772 if (!ValidateAndSetGrammarList(aRv)) { 773 return; 774 } 775 776 mEncodeTaskQueue = 777 TaskQueue::Create(GetMediaThreadPool(MediaThreadType::WEBRTC_WORKER), 778 "WebSpeechEncoderThread"); 779 780 nsresult rv; 781 rv = mRecognitionService->Initialize(this); 782 if (NS_WARN_IF(NS_FAILED(rv))) { 783 return; 784 } 785 786 MediaStreamConstraints constraints; 787 constraints.mAudio.SetAsBoolean() = true; 788 789 MOZ_ASSERT(!mListener); 790 mListener = new TrackListener(this); 791 792 if (aStream.WasPassed()) { 793 mStream = &aStream.Value(); 794 mTrackIsOwned = false; 795 mStream->RegisterTrackListener(mListener); 796 nsTArray<RefPtr<AudioStreamTrack>> tracks; 797 mStream->GetAudioTracks(tracks); 798 for (const RefPtr<AudioStreamTrack>& track : tracks) { 799 if (!track->Ended()) { 800 NotifyTrackAdded(track); 801 break; 802 } 803 } 804 } else { 805 mTrackIsOwned = true; 806 nsPIDOMWindowInner* win = GetOwnerWindow(); 807 if (!win || !win->IsFullyActive()) { 808 aRv.ThrowInvalidStateError("The document is not fully active."); 809 return; 810 } 811 AutoNoJSAPI nojsapi; 812 RefPtr<SpeechRecognition> self(this); 813 MediaManager::Get() 814 ->GetUserMedia(win, constraints, aCallerType) 815 ->Then( 816 GetCurrentSerialEventTarget(), __func__, 817 [this, self, 818 generation = mStreamGeneration](RefPtr<DOMMediaStream>&& aStream) { 819 nsTArray<RefPtr<AudioStreamTrack>> tracks; 820 aStream->GetAudioTracks(tracks); 821 if (mAborted || mCurrentState != STATE_STARTING || 822 mStreamGeneration != generation) { 823 // We were probably aborted. Exit early. 824 for (const RefPtr<AudioStreamTrack>& track : tracks) { 825 track->Stop(); 826 } 827 return; 828 } 829 mStream = std::move(aStream); 830 mStream->RegisterTrackListener(mListener); 831 for (const RefPtr<AudioStreamTrack>& track : tracks) { 832 if (!track->Ended()) { 833 NotifyTrackAdded(track); 834 } 835 } 836 }, 837 [this, self, 838 generation = mStreamGeneration](RefPtr<MediaMgrError>&& error) { 839 if (mAborted || mCurrentState != STATE_STARTING || 840 mStreamGeneration != generation) { 841 // We were probably aborted. Exit early. 842 return; 843 } 844 SpeechRecognitionErrorCode errorCode; 845 846 if (error->mName == MediaMgrError::Name::NotAllowedError) { 847 errorCode = SpeechRecognitionErrorCode::Not_allowed; 848 } else { 849 errorCode = SpeechRecognitionErrorCode::Audio_capture; 850 } 851 DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode, 852 error->mMessage); 853 }); 854 } 855 856 RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START); 857 NS_DispatchToMainThread(event); 858 } 859 860 bool SpeechRecognition::SetRecognitionService(ErrorResult& aRv) { 861 if (!GetOwnerWindow()) { 862 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 863 return false; 864 } 865 866 // See: 867 // https://dvcs.w3.org/hg/speech-api/raw-file/tip/webspeechapi.html#dfn-lang 868 nsAutoString lang; 869 if (!mLang.IsEmpty()) { 870 lang = mLang; 871 } else { 872 nsCOMPtr<Document> document = GetOwnerWindow()->GetExtantDoc(); 873 if (!document) { 874 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 875 return false; 876 } 877 nsCOMPtr<Element> element = document->GetRootElement(); 878 if (!element) { 879 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 880 return false; 881 } 882 883 nsAutoString lang; 884 element->GetLang(lang); 885 } 886 887 auto result = CreateSpeechRecognitionService(GetOwnerWindow(), this, lang); 888 889 if (result.isErr()) { 890 switch (result.unwrapErr()) { 891 case ServiceCreationError::ServiceNotFound: 892 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 893 break; 894 default: 895 MOZ_CRASH("Unknown error"); 896 } 897 return false; 898 } 899 900 mRecognitionService = result.unwrap(); 901 MOZ_DIAGNOSTIC_ASSERT(mRecognitionService); 902 return true; 903 } 904 905 bool SpeechRecognition::ValidateAndSetGrammarList(ErrorResult& aRv) { 906 if (!mSpeechGrammarList) { 907 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 908 return false; 909 } 910 911 uint32_t grammarListLength = mSpeechGrammarList->Length(); 912 for (uint32_t count = 0; count < grammarListLength; ++count) { 913 RefPtr<SpeechGrammar> speechGrammar = mSpeechGrammarList->Item(count, aRv); 914 if (aRv.Failed()) { 915 return false; 916 } 917 if (NS_FAILED(mRecognitionService->ValidateAndSetGrammarList( 918 speechGrammar.get(), nullptr))) { 919 aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR); 920 return false; 921 } 922 } 923 924 return true; 925 } 926 927 void SpeechRecognition::Stop() { 928 RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP); 929 NS_DispatchToMainThread(event); 930 } 931 932 void SpeechRecognition::Abort() { 933 if (mAborted) { 934 return; 935 } 936 937 mAborted = true; 938 939 RefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT); 940 NS_DispatchToMainThread(event); 941 } 942 943 void SpeechRecognition::NotifyTrackAdded( 944 const RefPtr<MediaStreamTrack>& aTrack) { 945 if (mTrack) { 946 return; 947 } 948 949 RefPtr<AudioStreamTrack> audioTrack = aTrack->AsAudioStreamTrack(); 950 if (!audioTrack) { 951 return; 952 } 953 954 if (audioTrack->Ended()) { 955 return; 956 } 957 958 StartRecording(audioTrack); 959 } 960 961 void SpeechRecognition::DispatchError(EventType aErrorType, 962 SpeechRecognitionErrorCode aErrorCode, 963 const nsACString& aMessage) { 964 MOZ_ASSERT(NS_IsMainThread()); 965 MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR || 966 aErrorType == EVENT_AUDIO_ERROR, 967 "Invalid error type!"); 968 969 RefPtr<SpeechRecognitionError> srError = 970 new SpeechRecognitionError(nullptr, nullptr, nullptr); 971 972 srError->InitSpeechRecognitionError(u"error"_ns, true, false, aErrorCode, 973 aMessage); 974 975 RefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType); 976 event->mError = srError; 977 NS_DispatchToMainThread(event); 978 } 979 980 /* 981 * Buffer audio samples into mAudioSamplesBuffer until aBufferSize. 982 * Updates mBufferedSamples and returns the number of samples that were 983 * buffered. 984 */ 985 uint32_t SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples, 986 uint32_t aSampleCount) { 987 MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk); 988 MOZ_ASSERT(mAudioSamplesBuffer); 989 990 int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data()); 991 size_t samplesToCopy = 992 std::min(aSampleCount, mAudioSamplesPerChunk - mBufferedSamples); 993 994 PodCopy(samplesBuffer + mBufferedSamples, aSamples, samplesToCopy); 995 996 mBufferedSamples += samplesToCopy; 997 return samplesToCopy; 998 } 999 1000 /* 1001 * Split a samples buffer starting of a given size into 1002 * chunks of equal size. The chunks are stored in the array 1003 * received as argument. 1004 * Returns the offset of the end of the last chunk that was 1005 * created. 1006 */ 1007 uint32_t SpeechRecognition::SplitSamplesBuffer( 1008 const int16_t* aSamplesBuffer, uint32_t aSampleCount, 1009 nsTArray<RefPtr<SharedBuffer>>& aResult) { 1010 uint32_t chunkStart = 0; 1011 1012 while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) { 1013 CheckedInt<size_t> bufferSize(sizeof(int16_t)); 1014 bufferSize *= mAudioSamplesPerChunk; 1015 RefPtr<SharedBuffer> chunk = SharedBuffer::Create(bufferSize); 1016 1017 PodCopy(static_cast<short*>(chunk->Data()), aSamplesBuffer + chunkStart, 1018 mAudioSamplesPerChunk); 1019 1020 aResult.AppendElement(chunk.forget()); 1021 chunkStart += mAudioSamplesPerChunk; 1022 } 1023 1024 return chunkStart; 1025 } 1026 1027 AudioSegment* SpeechRecognition::CreateAudioSegment( 1028 nsTArray<RefPtr<SharedBuffer>>& aChunks) { 1029 AudioSegment* segment = new AudioSegment(); 1030 for (uint32_t i = 0; i < aChunks.Length(); ++i) { 1031 RefPtr<SharedBuffer> buffer = aChunks[i]; 1032 const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data()); 1033 1034 AutoTArray<const int16_t*, 1> channels; 1035 channels.AppendElement(chunkData); 1036 segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk, 1037 PRINCIPAL_HANDLE_NONE); 1038 } 1039 1040 return segment; 1041 } 1042 1043 void SpeechRecognition::FeedAudioData( 1044 nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, 1045 already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, 1046 MediaTrackListener* aProvider, TrackRate aTrackRate) { 1047 NS_ASSERTION(!NS_IsMainThread(), 1048 "FeedAudioData should not be called in the main thread"); 1049 1050 // Endpointer expects to receive samples in chunks whose size is a 1051 // multiple of its frame size. 1052 // Since we can't assume we will receive the frames in appropriate-sized 1053 // chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk 1054 // (a multiple of Endpointer's frame size) before feeding to Endpointer. 1055 1056 // ensure aSamples is deleted 1057 RefPtr<SharedBuffer> refSamples = aSamples; 1058 1059 uint32_t samplesIndex = 0; 1060 const int16_t* samples = static_cast<int16_t*>(refSamples->Data()); 1061 AutoTArray<RefPtr<SharedBuffer>, 5> chunksToSend; 1062 1063 // fill up our buffer and make a chunk out of it, if possible 1064 if (mBufferedSamples > 0) { 1065 samplesIndex += FillSamplesBuffer(samples, aDuration); 1066 1067 if (mBufferedSamples == mAudioSamplesPerChunk) { 1068 chunksToSend.AppendElement(mAudioSamplesBuffer.forget()); 1069 mBufferedSamples = 0; 1070 } 1071 } 1072 1073 // create sample chunks of correct size 1074 if (samplesIndex < aDuration) { 1075 samplesIndex += SplitSamplesBuffer(samples + samplesIndex, 1076 aDuration - samplesIndex, chunksToSend); 1077 } 1078 1079 // buffer remaining samples 1080 if (samplesIndex < aDuration) { 1081 mBufferedSamples = 0; 1082 CheckedInt<size_t> bufferSize(sizeof(int16_t)); 1083 bufferSize *= mAudioSamplesPerChunk; 1084 mAudioSamplesBuffer = SharedBuffer::Create(bufferSize); 1085 1086 FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex); 1087 } 1088 1089 AudioSegment* segment = CreateAudioSegment(chunksToSend); 1090 RefPtr<SpeechEvent> event = new SpeechEvent(aRecognition, EVENT_AUDIO_DATA); 1091 event->mAudioSegment = segment; 1092 event->mProvider = aProvider; 1093 event->mTrackRate = aTrackRate; 1094 NS_DispatchToMainThread(event); 1095 } 1096 1097 const char* SpeechRecognition::GetName(FSMState aId) { 1098 static const char* names[] = { 1099 "STATE_IDLE", "STATE_STARTING", 1100 "STATE_ESTIMATING", "STATE_WAITING_FOR_SPEECH", 1101 "STATE_RECOGNIZING", "STATE_WAITING_FOR_RESULT", 1102 "STATE_ABORTING", 1103 }; 1104 1105 MOZ_ASSERT(aId < STATE_COUNT); 1106 MOZ_ASSERT(std::size(names) == STATE_COUNT); 1107 return names[aId]; 1108 } 1109 1110 const char* SpeechRecognition::GetName(SpeechEvent* aEvent) { 1111 static const char* names[] = {"EVENT_START", 1112 "EVENT_STOP", 1113 "EVENT_ABORT", 1114 "EVENT_AUDIO_DATA", 1115 "EVENT_AUDIO_ERROR", 1116 "EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT", 1117 "EVENT_RECOGNITIONSERVICE_FINAL_RESULT", 1118 "EVENT_RECOGNITIONSERVICE_ERROR"}; 1119 1120 MOZ_ASSERT(aEvent->mType < EVENT_COUNT); 1121 MOZ_ASSERT(std::size(names) == EVENT_COUNT); 1122 return names[aEvent->mType]; 1123 } 1124 1125 TaskQueue* SpeechRecognition::GetTaskQueueForEncoding() const { 1126 MOZ_ASSERT(NS_IsMainThread()); 1127 return mEncodeTaskQueue; 1128 } 1129 1130 SpeechEvent::SpeechEvent(SpeechRecognition* aRecognition, 1131 SpeechRecognition::EventType aType) 1132 : Runnable("dom::SpeechEvent"), 1133 mAudioSegment(nullptr), 1134 mRecognitionResultList(nullptr), 1135 mError(nullptr), 1136 mRecognition(new nsMainThreadPtrHolder<SpeechRecognition>( 1137 "SpeechEvent::SpeechEvent", aRecognition)), 1138 mType(aType), 1139 mTrackRate(0) {} 1140 1141 SpeechEvent::SpeechEvent(nsMainThreadPtrHandle<SpeechRecognition>& aRecognition, 1142 SpeechRecognition::EventType aType) 1143 : Runnable("dom::SpeechEvent"), 1144 mAudioSegment(nullptr), 1145 mRecognitionResultList(nullptr), 1146 mError(nullptr), 1147 mRecognition(aRecognition), 1148 mType(aType), 1149 mTrackRate(0) {} 1150 1151 SpeechEvent::~SpeechEvent() { delete mAudioSegment; } 1152 1153 NS_IMETHODIMP 1154 SpeechEvent::Run() { 1155 mRecognition->ProcessEvent(this); 1156 return NS_OK; 1157 } 1158 1159 } // namespace mozilla::dom