// SapiService.cpp (13327B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "SapiService.h" 8 9 #include "mozilla/ClearOnShutdown.h" 10 #include "mozilla/Preferences.h" 11 #include "mozilla/ProfilerLabels.h" 12 #include "mozilla/StaticPrefs_media.h" 13 #include "mozilla/dom/nsSpeechTask.h" 14 #include "mozilla/dom/nsSynthVoiceRegistry.h" 15 #include "nsEscape.h" 16 #include "nsISupports.h" 17 #include "nsServiceManagerUtils.h" 18 #include "nsXULAppAPI.h" 19 20 namespace mozilla::dom { 21 22 constexpr static WCHAR kSpCategoryOneCoreVoices[] = 23 L"HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Speech_OneCore\\Voices"; 24 25 StaticRefPtr<SapiService> SapiService::sSingleton; 26 27 class SapiCallback final : public nsISpeechTaskCallback { 28 public: 29 SapiCallback(nsISpeechTask* aTask, ISpVoice* aSapiClient, 30 uint32_t aTextOffset, uint32_t aSpeakTextLen) 31 : mTask(aTask), 32 mSapiClient(aSapiClient), 33 mTextOffset(aTextOffset), 34 mSpeakTextLen(aSpeakTextLen), 35 mCurrentIndex(0), 36 mStreamNum(0) { 37 mStartingTime = TimeStamp::Now(); 38 } 39 40 NS_DECL_CYCLE_COLLECTING_ISUPPORTS 41 NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(SapiCallback, nsISpeechTaskCallback) 42 43 NS_DECL_NSISPEECHTASKCALLBACK 44 45 ULONG GetStreamNum() const { return mStreamNum; } 46 void SetStreamNum(ULONG aValue) { mStreamNum = aValue; } 47 48 void OnSpeechEvent(const SPEVENT& speechEvent); 49 50 private: 51 ~SapiCallback() {} 52 53 float GetTimeDurationFromStart() const { 54 TimeDuration duration = TimeStamp::Now() - mStartingTime; 55 return duration.ToSeconds(); 56 } 57 58 // This pointer is used to dispatch events 59 nsCOMPtr<nsISpeechTask> mTask; 60 RefPtr<ISpVoice> mSapiClient; 61 62 uint32_t mTextOffset; 63 uint32_t 
mSpeakTextLen; 64 65 // Used for calculating the time taken to speak the utterance 66 TimeStamp mStartingTime; 67 uint32_t mCurrentIndex; 68 69 ULONG mStreamNum; 70 }; 71 72 NS_IMPL_CYCLE_COLLECTION(SapiCallback, mTask); 73 74 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(SapiCallback) 75 NS_INTERFACE_MAP_ENTRY(nsISpeechTaskCallback) 76 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechTaskCallback) 77 NS_INTERFACE_MAP_END 78 79 NS_IMPL_CYCLE_COLLECTING_ADDREF(SapiCallback) 80 NS_IMPL_CYCLE_COLLECTING_RELEASE(SapiCallback) 81 82 NS_IMETHODIMP 83 SapiCallback::OnPause() { 84 if (FAILED(mSapiClient->Pause())) { 85 return NS_ERROR_FAILURE; 86 } 87 if (!mTask) { 88 // When calling pause() on child porcess, it may not receive end event 89 // from chrome process yet. 90 return NS_ERROR_FAILURE; 91 } 92 mTask->DispatchPause(GetTimeDurationFromStart(), mCurrentIndex); 93 return NS_OK; 94 } 95 96 NS_IMETHODIMP 97 SapiCallback::OnResume() { 98 if (FAILED(mSapiClient->Resume())) { 99 return NS_ERROR_FAILURE; 100 } 101 if (!mTask) { 102 // When calling resume() on child porcess, it may not receive end event 103 // from chrome process yet. 104 return NS_ERROR_FAILURE; 105 } 106 mTask->DispatchResume(GetTimeDurationFromStart(), mCurrentIndex); 107 return NS_OK; 108 } 109 110 NS_IMETHODIMP 111 SapiCallback::OnCancel() { 112 // After cancel, mCurrentIndex may be updated. 113 // At cancel case, use mCurrentIndex for DispatchEnd. 
114 mSpeakTextLen = 0; 115 // Purge all the previous utterances and speak an empty string 116 if (FAILED(mSapiClient->Speak(nullptr, SPF_PURGEBEFORESPEAK, nullptr))) { 117 return NS_ERROR_FAILURE; 118 } 119 return NS_OK; 120 } 121 122 NS_IMETHODIMP 123 SapiCallback::OnVolumeChanged(float aVolume) { 124 mSapiClient->SetVolume(static_cast<USHORT>(aVolume * 100)); 125 return NS_OK; 126 } 127 128 void SapiCallback::OnSpeechEvent(const SPEVENT& speechEvent) { 129 switch (speechEvent.eEventId) { 130 case SPEI_START_INPUT_STREAM: 131 mTask->DispatchStart(); 132 break; 133 case SPEI_END_INPUT_STREAM: 134 if (mSpeakTextLen) { 135 mCurrentIndex = mSpeakTextLen; 136 } 137 mTask->DispatchEnd(GetTimeDurationFromStart(), mCurrentIndex); 138 mTask = nullptr; 139 break; 140 case SPEI_TTS_BOOKMARK: 141 mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; 142 mTask->DispatchBoundary(u"mark"_ns, GetTimeDurationFromStart(), 143 mCurrentIndex, 0, 0); 144 break; 145 case SPEI_WORD_BOUNDARY: 146 mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; 147 mTask->DispatchBoundary(u"word"_ns, GetTimeDurationFromStart(), 148 mCurrentIndex, 149 static_cast<ULONG>(speechEvent.wParam), 1); 150 break; 151 case SPEI_SENTENCE_BOUNDARY: 152 mCurrentIndex = static_cast<ULONG>(speechEvent.lParam) - mTextOffset; 153 mTask->DispatchBoundary(u"sentence"_ns, GetTimeDurationFromStart(), 154 mCurrentIndex, 155 static_cast<ULONG>(speechEvent.wParam), 1); 156 break; 157 default: 158 break; 159 } 160 } 161 162 // static 163 void __stdcall SapiService::SpeechEventCallback(WPARAM aWParam, 164 LPARAM aLParam) { 165 RefPtr<ISpVoice> spVoice = (ISpVoice*)aWParam; 166 RefPtr<SapiService> service = (SapiService*)aLParam; 167 168 SPEVENT speechEvent; 169 while (spVoice->GetEvents(1, &speechEvent, nullptr) == S_OK) { 170 for (size_t i = 0; i < service->mCallbacks.Length(); i++) { 171 RefPtr<SapiCallback> callback = service->mCallbacks[i]; 172 if (callback->GetStreamNum() == 
speechEvent.ulStreamNum) { 173 callback->OnSpeechEvent(speechEvent); 174 if (speechEvent.eEventId == SPEI_END_INPUT_STREAM) { 175 service->mCallbacks.RemoveElementAt(i); 176 } 177 break; 178 } 179 } 180 } 181 } 182 183 NS_INTERFACE_MAP_BEGIN(SapiService) 184 NS_INTERFACE_MAP_ENTRY(nsISpeechService) 185 NS_INTERFACE_MAP_ENTRY(nsIObserver) 186 NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsISpeechService) 187 NS_INTERFACE_MAP_END 188 189 NS_IMPL_ADDREF(SapiService) 190 NS_IMPL_RELEASE(SapiService) 191 192 SapiService::SapiService() : mInitialized(false) {} 193 194 SapiService::~SapiService() {} 195 196 bool SapiService::Init() { 197 AUTO_PROFILER_LABEL("SapiService::Init", OTHER); 198 199 MOZ_ASSERT(!mInitialized); 200 201 if (Preferences::GetBool("media.webspeech.synth.test") || 202 !StaticPrefs::media_webspeech_synth_enabled()) { 203 // When enabled, we shouldn't add OS backend (Bug 1160844) 204 return false; 205 } 206 207 // Get all the voices from sapi and register in the SynthVoiceRegistry 208 if (!RegisterVoices()) { 209 return false; 210 } 211 212 mInitialized = true; 213 return true; 214 } 215 216 already_AddRefed<ISpVoice> SapiService::InitSapiInstance() { 217 RefPtr<ISpVoice> spVoice; 218 if (FAILED(CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, 219 getter_AddRefs(spVoice)))) { 220 return nullptr; 221 } 222 223 // Set interest for all the events we are interested in 224 ULONGLONG eventMask = SPFEI(SPEI_START_INPUT_STREAM) | 225 SPFEI(SPEI_TTS_BOOKMARK) | SPFEI(SPEI_WORD_BOUNDARY) | 226 SPFEI(SPEI_SENTENCE_BOUNDARY) | 227 SPFEI(SPEI_END_INPUT_STREAM); 228 229 if (FAILED(spVoice->SetInterest(eventMask, eventMask))) { 230 return nullptr; 231 } 232 233 // Set the callback function for receiving the events 234 spVoice->SetNotifyCallbackFunction( 235 (SPNOTIFYCALLBACK*)SapiService::SpeechEventCallback, 236 (WPARAM)spVoice.get(), (LPARAM)this); 237 238 return spVoice.forget(); 239 } 240 241 bool SapiService::RegisterVoices() { 242 
nsCOMPtr<nsISynthVoiceRegistry> registry = 243 do_GetService(NS_SYNTHVOICEREGISTRY_CONTRACTID); 244 if (!registry) { 245 return false; 246 } 247 bool result = RegisterVoices(registry, kSpCategoryOneCoreVoices); 248 result |= RegisterVoices(registry, SPCAT_VOICES); 249 if (result) { 250 registry->NotifyVoicesChanged(); 251 } 252 return result; 253 } 254 255 bool SapiService::RegisterVoices(nsCOMPtr<nsISynthVoiceRegistry>& registry, 256 const WCHAR* categoryId) { 257 nsresult rv; 258 259 RefPtr<ISpObjectTokenCategory> category; 260 if (FAILED(CoCreateInstance(CLSID_SpObjectTokenCategory, nullptr, CLSCTX_ALL, 261 IID_ISpObjectTokenCategory, 262 getter_AddRefs(category)))) { 263 return false; 264 } 265 if (FAILED(category->SetId(categoryId, FALSE))) { 266 return false; 267 } 268 269 RefPtr<IEnumSpObjectTokens> voiceTokens; 270 if (FAILED(category->EnumTokens(nullptr, nullptr, 271 getter_AddRefs(voiceTokens)))) { 272 return false; 273 } 274 275 WCHAR locale[LOCALE_NAME_MAX_LENGTH]; 276 while (true) { 277 RefPtr<ISpObjectToken> voiceToken; 278 if (voiceTokens->Next(1, getter_AddRefs(voiceToken), nullptr) != S_OK) { 279 break; 280 } 281 282 RefPtr<ISpDataKey> attributes; 283 if (FAILED( 284 voiceToken->OpenKey(L"Attributes", getter_AddRefs(attributes)))) { 285 continue; 286 } 287 288 WCHAR* language = nullptr; 289 if (FAILED(attributes->GetStringValue(L"Language", &language))) { 290 continue; 291 } 292 293 // Language attribute is LCID by hex. So we need convert to locale 294 // name. 
295 nsAutoString hexLcid; 296 LCID lcid = wcstol(language, nullptr, 16); 297 CoTaskMemFree(language); 298 if (NS_WARN_IF( 299 !LCIDToLocaleName(lcid, locale, LOCALE_NAME_MAX_LENGTH, 0))) { 300 continue; 301 } 302 303 WCHAR* description = nullptr; 304 if (FAILED(voiceToken->GetStringValue(nullptr, &description))) { 305 continue; 306 } 307 308 nsAutoString uri; 309 uri.AssignLiteral("urn:moz-tts:sapi:"); 310 uri.Append(description); 311 uri.AppendLiteral("?"); 312 uri.Append(locale); 313 314 // This service can only speak one utterance at a time, se we set 315 // aQueuesUtterances to true in order to track global state and schedule 316 // access to this service. 317 rv = registry->AddVoice(this, uri, nsDependentString(description), 318 nsDependentString(locale), true, true); 319 CoTaskMemFree(description); 320 if (NS_FAILED(rv)) { 321 continue; 322 } 323 324 mVoices.InsertOrUpdate(uri, std::move(voiceToken)); 325 } 326 327 return true; 328 } 329 330 NS_IMETHODIMP 331 SapiService::Speak(const nsAString& aText, const nsAString& aUri, float aVolume, 332 float aRate, float aPitch, nsISpeechTask* aTask) { 333 NS_ENSURE_TRUE(mInitialized, NS_ERROR_NOT_AVAILABLE); 334 335 RefPtr<ISpObjectToken> voiceToken; 336 if (!mVoices.Get(aUri, getter_AddRefs(voiceToken))) { 337 return NS_ERROR_NOT_AVAILABLE; 338 } 339 340 RefPtr<ISpVoice> spVoice = InitSapiInstance(); 341 if (!spVoice) { 342 return NS_ERROR_FAILURE; 343 } 344 345 if (FAILED(spVoice->SetVoice(voiceToken))) { 346 return NS_ERROR_FAILURE; 347 } 348 349 if (FAILED(spVoice->SetVolume(static_cast<USHORT>(aVolume * 100)))) { 350 return NS_ERROR_FAILURE; 351 } 352 353 // The max supported rate in SAPI engines is 3x, and the min is 1/3x. It is 354 // expressed by an integer. 0 being normal rate, -10 is 1/3 and 10 is 3x. 355 // Values below and above that are allowed, but the engine may clip the rate 356 // to its maximum capable value. 
357 // "Each increment between -10 and +10 is logarithmically distributed such 358 // that incrementing or decrementing by 1 is multiplying or dividing the 359 // rate by the 10th root of 3" 360 // https://msdn.microsoft.com/en-us/library/ee431826(v=vs.85).aspx 361 long rate = aRate != 0 ? static_cast<long>(10 * log10(aRate) / log10(3)) : 0; 362 if (FAILED(spVoice->SetRate(rate))) { 363 return NS_ERROR_FAILURE; 364 } 365 366 // Set the pitch using xml 367 nsAutoString xml; 368 xml.AssignLiteral("<pitch absmiddle=\""); 369 // absmiddle doesn't allow float type 370 xml.AppendInt(static_cast<int32_t>(aPitch * 10.0f - 10.0f)); 371 xml.AppendLiteral("\">"); 372 uint32_t textOffset = xml.Length(); 373 374 for (size_t i = 0; i < aText.Length(); i++) { 375 switch (aText[i]) { 376 case '&': 377 xml.AppendLiteral("&"); 378 break; 379 case '<': 380 xml.AppendLiteral("<"); 381 break; 382 case '>': 383 xml.AppendLiteral(">"); 384 break; 385 default: 386 xml.Append(aText[i]); 387 break; 388 } 389 } 390 391 xml.AppendLiteral("</pitch>"); 392 393 RefPtr<SapiCallback> callback = 394 new SapiCallback(aTask, spVoice, textOffset, aText.Length()); 395 396 // The last three parameters doesn't matter for an indirect service 397 nsresult rv = aTask->Setup(callback); 398 if (NS_FAILED(rv)) { 399 return rv; 400 } 401 402 ULONG streamNum; 403 if (FAILED(spVoice->Speak(xml.get(), SPF_ASYNC, &streamNum))) { 404 aTask->Setup(nullptr); 405 return NS_ERROR_FAILURE; 406 } 407 408 callback->SetStreamNum(streamNum); 409 // streamNum reassigns same value when last stream is finished even if 410 // callback for stream end isn't called 411 // So we cannot use data hashtable and has to add it to vector at last. 
412 mCallbacks.AppendElement(callback); 413 414 return NS_OK; 415 } 416 417 NS_IMETHODIMP 418 SapiService::Observe(nsISupports* aSubject, const char* aTopic, 419 const char16_t* aData) { 420 return NS_OK; 421 } 422 423 SapiService* SapiService::GetInstance() { 424 MOZ_ASSERT(NS_IsMainThread()); 425 if (XRE_GetProcessType() != GeckoProcessType_Default) { 426 MOZ_ASSERT(false, "SapiService can only be started on main gecko process"); 427 return nullptr; 428 } 429 430 if (!sSingleton) { 431 RefPtr<SapiService> service = new SapiService(); 432 if (service->Init()) { 433 sSingleton = service; 434 ClearOnShutdown(&sSingleton); 435 } 436 } 437 return sSingleton; 438 } 439 440 already_AddRefed<SapiService> SapiService::GetInstanceForService() { 441 RefPtr<SapiService> sapiService = GetInstance(); 442 return sapiService.forget(); 443 } 444 445 } // namespace mozilla::dom