OnlineSpeechRecognitionService.cpp (14953B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim:set ts=2 sw=2 sts=2 et cindent: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "OnlineSpeechRecognitionService.h" 8 9 #include <json/json.h> 10 #include <string.h> 11 12 #include "OggWriter.h" 13 #include "OpusTrackEncoder.h" 14 #include "SpeechGrammar.h" 15 #include "SpeechRecognition.h" 16 #include "SpeechRecognitionAlternative.h" 17 #include "SpeechRecognitionResult.h" 18 #include "SpeechRecognitionResultList.h" 19 #include "mozilla/Preferences.h" 20 #include "mozilla/ScopeExit.h" 21 #include "mozilla/dom/Document.h" 22 #include "nsContentUtils.h" 23 #include "nsGlobalWindowInner.h" 24 #include "nsIChannel.h" 25 #include "nsIClassOfService.h" 26 #include "nsIHttpChannel.h" 27 #include "nsIOutputStream.h" 28 #include "nsIPrincipal.h" 29 #include "nsIStreamListener.h" 30 #include "nsIUploadChannel2.h" 31 #include "nsNetUtil.h" 32 #include "nsStringStream.h" 33 #include "nsThreadUtils.h" 34 35 namespace mozilla { 36 37 using namespace dom; 38 39 #define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \ 40 "media.webspeech.service.endpoint" 41 #define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/" 42 #define MAX_LISTENING_TIME_MS 10000 43 44 NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService, 45 nsIStreamListener) 46 47 NS_IMETHODIMP 48 OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) { 49 MOZ_ASSERT(NS_IsMainThread()); 50 return NS_OK; 51 } 52 53 static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure, 54 const char* aFromRawSegment, 55 uint32_t aToOffset, uint32_t aCount, 56 uint32_t* aWriteCount) { 57 nsCString* buf = static_cast<nsCString*>(aClosure); 58 buf->Append(aFromRawSegment, aCount); 59 *aWriteCount = aCount; 60 return NS_OK; 61 } 62 63 NS_IMETHODIMP 64 OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest, 65 nsIInputStream* aInputStream, 66 uint64_t aOffset, 67 uint32_t aCount) { 68 MOZ_ASSERT(NS_IsMainThread()); 69 nsresult rv; 70 uint32_t readCount; 71 rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount, 72 &readCount); 73 NS_ENSURE_SUCCESS(rv, rv); 74 return NS_OK; 75 } 76 77 NS_IMETHODIMP 78 OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest, 79 nsresult aStatusCode) { 80 MOZ_ASSERT(NS_IsMainThread()); 81 82 auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); }); 83 84 if (mAborted) { 85 return NS_OK; 86 } 87 88 bool success; 89 float confidence = 0; 90 Json::Value root; 91 Json::CharReaderBuilder builder; 92 bool parsingSuccessful; 93 nsAutoCString result; 94 nsAutoCString hypoValue; 95 nsAutoCString errorMsg; 96 SpeechRecognitionErrorCode errorCode; 97 98 SR_LOG("STT Result: %s", mBuf.get()); 99 100 if (NS_FAILED(aStatusCode)) { 101 success = false; 102 errorMsg.AssignLiteral("Error connecting to the service."); 103 errorCode = SpeechRecognitionErrorCode::Network; 104 } else { 105 success = true; 106 UniquePtr<Json::CharReader> const reader(builder.newCharReader()); 107 parsingSuccessful = 108 reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr); 109 if (!parsingSuccessful) { 110 // there's an internal server error 111 success = false; 112 errorMsg.AssignLiteral("Internal server error"); 113 errorCode = SpeechRecognitionErrorCode::Network; 114 } else { 115 result.Assign(root.get("status", "error").asString().c_str()); 116 if (result.EqualsLiteral("ok")) { 117 // ok, we have a result 118 if (!root["data"].empty()) { 119 hypoValue.Assign(root["data"][0].get("text", "").asString().c_str()); 120 confidence = root["data"][0].get("confidence", "0").asFloat(); 121 } else { 122 success = false; 123 errorMsg.AssignLiteral("Error reading result data."); 124 errorCode = SpeechRecognitionErrorCode::Network; 125 } 126 } else { 127 success = false; 128 errorMsg.Assign(root.get("message", "").asString().c_str()); 129 errorCode = SpeechRecognitionErrorCode::No_speech; 130 } 131 } 132 } 133 134 if (!success) { 135 mRecognition->DispatchError( 136 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg); 137 } else { 138 // Declare javascript result events 139 RefPtr<SpeechEvent> event = new SpeechEvent( 140 mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT); 141 SpeechRecognitionResultList* resultList = 142 new SpeechRecognitionResultList(mRecognition); 143 SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition); 144 145 if (mRecognition->MaxAlternatives() > 0) { 146 SpeechRecognitionAlternative* alternative = 147 new SpeechRecognitionAlternative(mRecognition); 148 149 alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue); 150 alternative->mConfidence = confidence; 151 152 result->mItems.AppendElement(alternative); 153 } 154 resultList->mItems.AppendElement(result); 155 156 event->mRecognitionResultList = resultList; 157 NS_DispatchToMainThread(event); 158 } 159 160 return NS_OK; 161 } 162 163 OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default; 164 OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default; 165 166 NS_IMETHODIMP 167 OnlineSpeechRecognitionService::Initialize( 168 WeakPtr<SpeechRecognition> aSpeechRecognition) { 169 MOZ_ASSERT(NS_IsMainThread()); 170 mWriter = MakeUnique<OggWriter>(); 171 mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>( 172 "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition); 173 mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding(); 174 MOZ_ASSERT(mEncodeTaskQueue); 175 return NS_OK; 176 } 177 178 void OnlineSpeechRecognitionService::EncoderFinished() { 179 MOZ_ASSERT(!NS_IsMainThread()); 180 MOZ_ASSERT(mEncodedAudioQueue.IsFinished()); 181 182 while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) { 183 AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame}); 184 DebugOnly<nsresult> rv = 185 mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream() 186 ? ContainerWriter::END_OF_STREAM 187 : 0); 188 MOZ_ASSERT(NS_SUCCEEDED(rv)); 189 } 190 191 mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED); 192 MOZ_ASSERT(mWriter->IsWritingComplete()); 193 194 NS_DispatchToMainThread( 195 NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this, 196 &OnlineSpeechRecognitionService::DoSTT)); 197 } 198 199 void OnlineSpeechRecognitionService::EncoderInitialized() { 200 MOZ_ASSERT(!NS_IsMainThread()); 201 AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata; 202 metadata.AppendElement(mAudioEncoder->GetMetadata()); 203 if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) { 204 SR_LOG("wrong meta data type!"); 205 MOZ_ASSERT_UNREACHABLE(); 206 } 207 208 nsresult rv = mWriter->SetMetadata(metadata); 209 MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); 210 211 rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER); 212 MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); 213 214 (void)rv; 215 } 216 217 void OnlineSpeechRecognitionService::EncoderError() { 218 MOZ_ASSERT(!NS_IsMainThread()); 219 SR_LOG("Error encoding frames."); 220 mEncodedData.Clear(); 221 NS_DispatchToMainThread(NS_NewRunnableFunction( 222 "SpeechRecognition::DispatchError", 223 [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { 224 if (!mRecognition) { 225 return; 226 } 227 mRecognition->DispatchError( 228 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 229 SpeechRecognitionErrorCode::Audio_capture, "Encoder error"); 230 })); 231 } 232 233 NS_IMETHODIMP 234 OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, 235 int32_t aSampleRate) { 236 MOZ_ASSERT(!NS_IsMainThread()); 237 int64_t duration = aAudioSegment->GetDuration(); 238 if (duration <= 0) { 239 return NS_OK; 240 } 241 242 if (!mAudioEncoder) { 243 mSpeechEncoderListener = new SpeechEncoderListener(this); 244 mAudioEncoder = 245 MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue); 246 RefPtr<AbstractThread> mEncoderThread = AbstractThread::GetCurrent(); 247 mAudioEncoder->SetWorkerThread(mEncoderThread); 248 mAudioEncoder->RegisterListener(mSpeechEncoderListener); 249 } 250 251 mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment)); 252 253 TimeStamp now = TimeStamp::Now(); 254 if (mFirstIteration.IsNull()) { 255 mFirstIteration = now; 256 } 257 258 if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) { 259 NS_DispatchToMainThread(NS_NewRunnableFunction( 260 "SpeechRecognition::Stop", 261 [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { 262 if (!mRecognition) { 263 return; 264 } 265 mRecognition->Stop(); 266 })); 267 268 return NS_OK; 269 } 270 271 return NS_OK; 272 } 273 274 void OnlineSpeechRecognitionService::DoSTT() { 275 MOZ_ASSERT(NS_IsMainThread()); 276 277 if (mAborted) { 278 return; 279 } 280 281 nsresult rv; 282 nsCOMPtr<nsIChannel> chan; 283 nsCOMPtr<nsIURI> uri; 284 nsAutoCString speechRecognitionEndpoint; 285 nsAutoCString prefEndpoint; 286 nsAutoString language; 287 288 Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT, 289 prefEndpoint); 290 291 if (!prefEndpoint.IsEmpty()) { 292 speechRecognitionEndpoint = prefEndpoint; 293 } else { 294 speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT; 295 } 296 297 rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr, 298 nullptr); 299 if (NS_WARN_IF(NS_FAILED(rv))) { 300 mRecognition->DispatchError( 301 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 302 SpeechRecognitionErrorCode::Network, "Unknown URI"); 303 return; 304 } 305 306 nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT; 307 nsLoadFlags loadFlags = 308 nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER; 309 nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER; 310 311 nsGlobalWindowInner* window = mRecognition->GetOwnerWindow(); 312 if (NS_WARN_IF(!window)) { 313 mRecognition->DispatchError( 314 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 315 SpeechRecognitionErrorCode::Aborted, "No window"); 316 return; 317 } 318 319 Document* doc = window->GetExtantDoc(); 320 if (NS_WARN_IF(!doc)) { 321 mRecognition->DispatchError( 322 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 323 SpeechRecognitionErrorCode::Aborted, "No document"); 324 } 325 rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags, 326 contentPolicy, nullptr, nullptr, nullptr, nullptr, 327 loadFlags); 328 if (NS_WARN_IF(NS_FAILED(rv))) { 329 mRecognition->DispatchError( 330 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 331 SpeechRecognitionErrorCode::Network, "Failed to open channel"); 332 return; 333 } 334 335 nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan); 336 if (httpChan) { 337 rv = httpChan->SetRequestMethod("POST"_ns); 338 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 339 } 340 341 if (httpChan) { 342 mRecognition->GetLang(language); 343 // Accept-Language-STT is a custom header of our backend server used to set 344 // the language of the speech sample being submitted by the client 345 rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns, 346 NS_ConvertUTF16toUTF8(language), false); 347 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 348 // Tell the server to not store the transcription by default 349 rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false); 350 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 351 // Tell the server to not store the sample by default 352 rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false); 353 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 354 // Set the product tag as the web speech api 355 rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false); 356 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 357 } 358 359 nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan)); 360 if (cos) { 361 cos->AddClassFlags(nsIClassOfService::UrgentStart); 362 } 363 364 nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan); 365 if (uploadChan) { 366 nsCOMPtr<nsIInputStream> bodyStream; 367 uint32_t length = 0; 368 for (const nsTArray<uint8_t>& chunk : mEncodedData) { 369 length += chunk.Length(); 370 } 371 372 nsTArray<uint8_t> audio; 373 if (!audio.SetCapacity(length, fallible)) { 374 mRecognition->DispatchError( 375 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 376 SpeechRecognitionErrorCode::Audio_capture, "Allocation error"); 377 return; 378 } 379 380 for (const nsTArray<uint8_t>& chunk : mEncodedData) { 381 audio.AppendElements(chunk); 382 } 383 384 mEncodedData.Clear(); 385 386 rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio)); 387 if (NS_WARN_IF(NS_FAILED(rv))) { 388 mRecognition->DispatchError( 389 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 390 SpeechRecognitionErrorCode::Network, "Failed to open stream"); 391 return; 392 } 393 if (bodyStream) { 394 rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns, 395 length, "POST"_ns, false); 396 MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv)); 397 } 398 } 399 400 rv = chan->AsyncOpen(this); 401 if (NS_WARN_IF(NS_FAILED(rv))) { 402 mRecognition->DispatchError( 403 SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, 404 SpeechRecognitionErrorCode::Network, "Internal server error"); 405 } 406 } 407 408 NS_IMETHODIMP 409 OnlineSpeechRecognitionService::SoundEnd() { 410 MOZ_ASSERT(NS_IsMainThread()); 411 412 if (!mEncodeTaskQueue) { 413 // Not initialized 414 return NS_OK; 415 } 416 417 nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction( 418 "OnlineSpeechRecognitionService::SoundEnd", 419 [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() { 420 if (mAudioEncoder) { 421 mAudioEncoder->NotifyEndOfStream(); 422 mAudioEncoder->UnregisterListener(mSpeechEncoderListener); 423 mSpeechEncoderListener = nullptr; 424 mAudioEncoder = nullptr; 425 EncoderFinished(); 426 } 427 })); 428 MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv)); 429 (void)rv; 430 431 mEncodeTaskQueue = nullptr; 432 433 return NS_OK; 434 } 435 436 NS_IMETHODIMP 437 OnlineSpeechRecognitionService::ValidateAndSetGrammarList( 438 SpeechGrammar* aSpeechGrammar, 439 nsISpeechGrammarCompilationCallback* aCallback) { 440 // This is an online LVCSR (STT) service, 441 // so we don't need to set a grammar 442 return NS_OK; 443 } 444 445 NS_IMETHODIMP 446 OnlineSpeechRecognitionService::Abort() { 447 MOZ_ASSERT(NS_IsMainThread()); 448 if (mAborted) { 449 return NS_OK; 450 } 451 mAborted = true; 452 return SoundEnd(); 453 } 454 } // namespace mozilla