tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

OnlineSpeechRecognitionService.cpp (14953B)


      1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim:set ts=2 sw=2 sts=2 et cindent: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "OnlineSpeechRecognitionService.h"
      8 
      9 #include <json/json.h>
     10 #include <string.h>
     11 
     12 #include "OggWriter.h"
     13 #include "OpusTrackEncoder.h"
     14 #include "SpeechGrammar.h"
     15 #include "SpeechRecognition.h"
     16 #include "SpeechRecognitionAlternative.h"
     17 #include "SpeechRecognitionResult.h"
     18 #include "SpeechRecognitionResultList.h"
     19 #include "mozilla/Preferences.h"
     20 #include "mozilla/ScopeExit.h"
     21 #include "mozilla/dom/Document.h"
     22 #include "nsContentUtils.h"
     23 #include "nsGlobalWindowInner.h"
     24 #include "nsIChannel.h"
     25 #include "nsIClassOfService.h"
     26 #include "nsIHttpChannel.h"
     27 #include "nsIOutputStream.h"
     28 #include "nsIPrincipal.h"
     29 #include "nsIStreamListener.h"
     30 #include "nsIUploadChannel2.h"
     31 #include "nsNetUtil.h"
     32 #include "nsStringStream.h"
     33 #include "nsThreadUtils.h"
     34 
     35 namespace mozilla {
     36 
     37 using namespace dom;
     38 
     39 #define PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT \
     40  "media.webspeech.service.endpoint"
     41 #define DEFAULT_RECOGNITION_ENDPOINT "https://speaktome-2.services.mozilla.com/"
     42 #define MAX_LISTENING_TIME_MS 10000
     43 
     44 NS_IMPL_ISUPPORTS(OnlineSpeechRecognitionService, nsISpeechRecognitionService,
     45                  nsIStreamListener)
     46 
NS_IMETHODIMP
OnlineSpeechRecognitionService::OnStartRequest(nsIRequest* aRequest) {
  // Nothing to do when the STT HTTP response starts: the body is accumulated
  // in OnDataAvailable() and parsed in OnStopRequest().
  MOZ_ASSERT(NS_IsMainThread());
  return NS_OK;
}
     52 
     53 static nsresult AssignResponseToBuffer(nsIInputStream* aIn, void* aClosure,
     54                                       const char* aFromRawSegment,
     55                                       uint32_t aToOffset, uint32_t aCount,
     56                                       uint32_t* aWriteCount) {
     57  nsCString* buf = static_cast<nsCString*>(aClosure);
     58  buf->Append(aFromRawSegment, aCount);
     59  *aWriteCount = aCount;
     60  return NS_OK;
     61 }
     62 
     63 NS_IMETHODIMP
     64 OnlineSpeechRecognitionService::OnDataAvailable(nsIRequest* aRequest,
     65                                                nsIInputStream* aInputStream,
     66                                                uint64_t aOffset,
     67                                                uint32_t aCount) {
     68  MOZ_ASSERT(NS_IsMainThread());
     69  nsresult rv;
     70  uint32_t readCount;
     71  rv = aInputStream->ReadSegments(AssignResponseToBuffer, &mBuf, aCount,
     72                                  &readCount);
     73  NS_ENSURE_SUCCESS(rv, rv);
     74  return NS_OK;
     75 }
     76 
     77 NS_IMETHODIMP
     78 OnlineSpeechRecognitionService::OnStopRequest(nsIRequest* aRequest,
     79                                              nsresult aStatusCode) {
     80  MOZ_ASSERT(NS_IsMainThread());
     81 
     82  auto clearBuf = MakeScopeExit([&] { mBuf.Truncate(); });
     83 
     84  if (mAborted) {
     85    return NS_OK;
     86  }
     87 
     88  bool success;
     89  float confidence = 0;
     90  Json::Value root;
     91  Json::CharReaderBuilder builder;
     92  bool parsingSuccessful;
     93  nsAutoCString result;
     94  nsAutoCString hypoValue;
     95  nsAutoCString errorMsg;
     96  SpeechRecognitionErrorCode errorCode;
     97 
     98  SR_LOG("STT Result: %s", mBuf.get());
     99 
    100  if (NS_FAILED(aStatusCode)) {
    101    success = false;
    102    errorMsg.AssignLiteral("Error connecting to the service.");
    103    errorCode = SpeechRecognitionErrorCode::Network;
    104  } else {
    105    success = true;
    106    UniquePtr<Json::CharReader> const reader(builder.newCharReader());
    107    parsingSuccessful =
    108        reader->parse(mBuf.BeginReading(), mBuf.EndReading(), &root, nullptr);
    109    if (!parsingSuccessful) {
    110      // there's an internal server error
    111      success = false;
    112      errorMsg.AssignLiteral("Internal server error");
    113      errorCode = SpeechRecognitionErrorCode::Network;
    114    } else {
    115      result.Assign(root.get("status", "error").asString().c_str());
    116      if (result.EqualsLiteral("ok")) {
    117        // ok, we have a result
    118        if (!root["data"].empty()) {
    119          hypoValue.Assign(root["data"][0].get("text", "").asString().c_str());
    120          confidence = root["data"][0].get("confidence", "0").asFloat();
    121        } else {
    122          success = false;
    123          errorMsg.AssignLiteral("Error reading result data.");
    124          errorCode = SpeechRecognitionErrorCode::Network;
    125        }
    126      } else {
    127        success = false;
    128        errorMsg.Assign(root.get("message", "").asString().c_str());
    129        errorCode = SpeechRecognitionErrorCode::No_speech;
    130      }
    131    }
    132  }
    133 
    134  if (!success) {
    135    mRecognition->DispatchError(
    136        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR, errorCode, errorMsg);
    137  } else {
    138    // Declare javascript result events
    139    RefPtr<SpeechEvent> event = new SpeechEvent(
    140        mRecognition, SpeechRecognition::EVENT_RECOGNITIONSERVICE_FINAL_RESULT);
    141    SpeechRecognitionResultList* resultList =
    142        new SpeechRecognitionResultList(mRecognition);
    143    SpeechRecognitionResult* result = new SpeechRecognitionResult(mRecognition);
    144 
    145    if (mRecognition->MaxAlternatives() > 0) {
    146      SpeechRecognitionAlternative* alternative =
    147          new SpeechRecognitionAlternative(mRecognition);
    148 
    149      alternative->mTranscript = NS_ConvertUTF8toUTF16(hypoValue);
    150      alternative->mConfidence = confidence;
    151 
    152      result->mItems.AppendElement(alternative);
    153    }
    154    resultList->mItems.AppendElement(result);
    155 
    156    event->mRecognitionResultList = resultList;
    157    NS_DispatchToMainThread(event);
    158  }
    159 
    160  return NS_OK;
    161 }
    162 
// Construction and destruction have no work to do; all state is set up in
// Initialize() and by the encoder callbacks.
OnlineSpeechRecognitionService::OnlineSpeechRecognitionService() = default;
OnlineSpeechRecognitionService::~OnlineSpeechRecognitionService() = default;
    165 
NS_IMETHODIMP
OnlineSpeechRecognitionService::Initialize(
    WeakPtr<SpeechRecognition> aSpeechRecognition) {
  // Binds this service to a SpeechRecognition session: creates the Ogg
  // container writer, wraps the weak recognition pointer in a
  // main-thread-only holder, and caches the task queue used for encoding.
  // Note: mRecognition must be set before GetTaskQueueForEncoding() is
  // called on it.
  MOZ_ASSERT(NS_IsMainThread());
  mWriter = MakeUnique<OggWriter>();
  mRecognition = new nsMainThreadPtrHolder<SpeechRecognition>(
      "OnlineSpeechRecognitionService::mRecognition", aSpeechRecognition);
  mEncodeTaskQueue = mRecognition->GetTaskQueueForEncoding();
  MOZ_ASSERT(mEncodeTaskQueue);
  return NS_OK;
}
    177 
// Runs off the main thread once the encoder has produced its final frame:
// muxes every queued encoded frame into the Ogg container, flushes the
// container data into mEncodedData, and hands off to DoSTT() on the main
// thread to upload the sample.
void OnlineSpeechRecognitionService::EncoderFinished() {
  MOZ_ASSERT(!NS_IsMainThread());
  MOZ_ASSERT(mEncodedAudioQueue.IsFinished());

  while (RefPtr<EncodedFrame> frame = mEncodedAudioQueue.PopFront()) {
    AutoTArray<RefPtr<EncodedFrame>, 1> frames({frame});
    // The frame that empties the queue is written with END_OF_STREAM so the
    // writer can finalize the Ogg stream.
    DebugOnly<nsresult> rv =
        mWriter->WriteEncodedTrack(frames, mEncodedAudioQueue.AtEndOfStream()
                                               ? ContainerWriter::END_OF_STREAM
                                               : 0);
    MOZ_ASSERT(NS_SUCCEEDED(rv));
  }

  mWriter->GetContainerData(&mEncodedData, ContainerWriter::FLUSH_NEEDED);
  MOZ_ASSERT(mWriter->IsWritingComplete());

  NS_DispatchToMainThread(
      NewRunnableMethod("OnlineSpeechRecognitionService::DoSTT", this,
                        &OnlineSpeechRecognitionService::DoSTT));
}
    198 
// Encoder callback (runs off the main thread): once the encoder has produced
// metadata, installs it on the Ogg writer and emits the container header into
// mEncodedData.
void OnlineSpeechRecognitionService::EncoderInitialized() {
  MOZ_ASSERT(!NS_IsMainThread());
  AutoTArray<RefPtr<TrackMetadataBase>, 1> metadata;
  metadata.AppendElement(mAudioEncoder->GetMetadata());
  if (metadata[0]->GetKind() != TrackMetadataBase::METADATA_OPUS) {
    // Only an OpusTrackEncoder is ever created here (see
    // ProcessAudioSegment), so any other kind is a programming error.
    SR_LOG("wrong meta data type!");
    MOZ_ASSERT_UNREACHABLE();
  }

  nsresult rv = mWriter->SetMetadata(metadata);
  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));

  rv = mWriter->GetContainerData(&mEncodedData, ContainerWriter::GET_HEADER);
  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));

  // rv is only consumed by the diagnostic asserts above; keep release builds
  // free of unused-variable warnings.
  (void)rv;
}
    216 
// Encoder callback (runs off the main thread): drops any partially encoded
// data and reports an audio-capture error to the SpeechRecognition object on
// the main thread.
void OnlineSpeechRecognitionService::EncoderError() {
  MOZ_ASSERT(!NS_IsMainThread());
  SR_LOG("Error encoding frames.");
  mEncodedData.Clear();
  // `self` keeps this service alive until the runnable has executed.
  NS_DispatchToMainThread(NS_NewRunnableFunction(
      "SpeechRecognition::DispatchError",
      [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
        if (!mRecognition) {
          return;
        }
        mRecognition->DispatchError(
            SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
            SpeechRecognitionErrorCode::Audio_capture, "Encoder error");
      }));
}
    232 
    233 NS_IMETHODIMP
    234 OnlineSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
    235                                                    int32_t aSampleRate) {
    236  MOZ_ASSERT(!NS_IsMainThread());
    237  int64_t duration = aAudioSegment->GetDuration();
    238  if (duration <= 0) {
    239    return NS_OK;
    240  }
    241 
    242  if (!mAudioEncoder) {
    243    mSpeechEncoderListener = new SpeechEncoderListener(this);
    244    mAudioEncoder =
    245        MakeUnique<OpusTrackEncoder>(aSampleRate, mEncodedAudioQueue);
    246    RefPtr<AbstractThread> mEncoderThread = AbstractThread::GetCurrent();
    247    mAudioEncoder->SetWorkerThread(mEncoderThread);
    248    mAudioEncoder->RegisterListener(mSpeechEncoderListener);
    249  }
    250 
    251  mAudioEncoder->AppendAudioSegment(std::move(*aAudioSegment));
    252 
    253  TimeStamp now = TimeStamp::Now();
    254  if (mFirstIteration.IsNull()) {
    255    mFirstIteration = now;
    256  }
    257 
    258  if ((now - mFirstIteration).ToMilliseconds() >= MAX_LISTENING_TIME_MS) {
    259    NS_DispatchToMainThread(NS_NewRunnableFunction(
    260        "SpeechRecognition::Stop",
    261        [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
    262          if (!mRecognition) {
    263            return;
    264          }
    265          mRecognition->Stop();
    266        }));
    267 
    268    return NS_OK;
    269  }
    270 
    271  return NS_OK;
    272 }
    273 
    274 void OnlineSpeechRecognitionService::DoSTT() {
    275  MOZ_ASSERT(NS_IsMainThread());
    276 
    277  if (mAborted) {
    278    return;
    279  }
    280 
    281  nsresult rv;
    282  nsCOMPtr<nsIChannel> chan;
    283  nsCOMPtr<nsIURI> uri;
    284  nsAutoCString speechRecognitionEndpoint;
    285  nsAutoCString prefEndpoint;
    286  nsAutoString language;
    287 
    288  Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_ENDPOINT,
    289                          prefEndpoint);
    290 
    291  if (!prefEndpoint.IsEmpty()) {
    292    speechRecognitionEndpoint = prefEndpoint;
    293  } else {
    294    speechRecognitionEndpoint = DEFAULT_RECOGNITION_ENDPOINT;
    295  }
    296 
    297  rv = NS_NewURI(getter_AddRefs(uri), speechRecognitionEndpoint, nullptr,
    298                 nullptr);
    299  if (NS_WARN_IF(NS_FAILED(rv))) {
    300    mRecognition->DispatchError(
    301        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    302        SpeechRecognitionErrorCode::Network, "Unknown URI");
    303    return;
    304  }
    305 
    306  nsSecurityFlags secFlags = nsILoadInfo::SEC_REQUIRE_CORS_INHERITS_SEC_CONTEXT;
    307  nsLoadFlags loadFlags =
    308      nsIRequest::LOAD_NORMAL | nsIChannel::LOAD_BYPASS_SERVICE_WORKER;
    309  nsContentPolicyType contentPolicy = nsIContentPolicy::TYPE_OTHER;
    310 
    311  nsGlobalWindowInner* window = mRecognition->GetOwnerWindow();
    312  if (NS_WARN_IF(!window)) {
    313    mRecognition->DispatchError(
    314        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    315        SpeechRecognitionErrorCode::Aborted, "No window");
    316    return;
    317  }
    318 
    319  Document* doc = window->GetExtantDoc();
    320  if (NS_WARN_IF(!doc)) {
    321    mRecognition->DispatchError(
    322        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    323        SpeechRecognitionErrorCode::Aborted, "No document");
    324  }
    325  rv = NS_NewChannel(getter_AddRefs(chan), uri, doc->NodePrincipal(), secFlags,
    326                     contentPolicy, nullptr, nullptr, nullptr, nullptr,
    327                     loadFlags);
    328  if (NS_WARN_IF(NS_FAILED(rv))) {
    329    mRecognition->DispatchError(
    330        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    331        SpeechRecognitionErrorCode::Network, "Failed to open channel");
    332    return;
    333  }
    334 
    335  nsCOMPtr<nsIHttpChannel> httpChan = do_QueryInterface(chan);
    336  if (httpChan) {
    337    rv = httpChan->SetRequestMethod("POST"_ns);
    338    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    339  }
    340 
    341  if (httpChan) {
    342    mRecognition->GetLang(language);
    343    // Accept-Language-STT is a custom header of our backend server used to set
    344    // the language of the speech sample being submitted by the client
    345    rv = httpChan->SetRequestHeader("Accept-Language-STT"_ns,
    346                                    NS_ConvertUTF16toUTF8(language), false);
    347    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    348    // Tell the server to not store the transcription by default
    349    rv = httpChan->SetRequestHeader("Store-Transcription"_ns, "0"_ns, false);
    350    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    351    // Tell the server to not store the sample by default
    352    rv = httpChan->SetRequestHeader("Store-Sample"_ns, "0"_ns, false);
    353    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    354    // Set the product tag as the web speech api
    355    rv = httpChan->SetRequestHeader("Product-Tag"_ns, "wsa"_ns, false);
    356    MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    357  }
    358 
    359  nsCOMPtr<nsIClassOfService> cos(do_QueryInterface(chan));
    360  if (cos) {
    361    cos->AddClassFlags(nsIClassOfService::UrgentStart);
    362  }
    363 
    364  nsCOMPtr<nsIUploadChannel2> uploadChan = do_QueryInterface(chan);
    365  if (uploadChan) {
    366    nsCOMPtr<nsIInputStream> bodyStream;
    367    uint32_t length = 0;
    368    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
    369      length += chunk.Length();
    370    }
    371 
    372    nsTArray<uint8_t> audio;
    373    if (!audio.SetCapacity(length, fallible)) {
    374      mRecognition->DispatchError(
    375          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    376          SpeechRecognitionErrorCode::Audio_capture, "Allocation error");
    377      return;
    378    }
    379 
    380    for (const nsTArray<uint8_t>& chunk : mEncodedData) {
    381      audio.AppendElements(chunk);
    382    }
    383 
    384    mEncodedData.Clear();
    385 
    386    rv = NS_NewByteInputStream(getter_AddRefs(bodyStream), std::move(audio));
    387    if (NS_WARN_IF(NS_FAILED(rv))) {
    388      mRecognition->DispatchError(
    389          SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    390          SpeechRecognitionErrorCode::Network, "Failed to open stream");
    391      return;
    392    }
    393    if (bodyStream) {
    394      rv = uploadChan->ExplicitSetUploadStream(bodyStream, "audio/ogg"_ns,
    395                                               length, "POST"_ns, false);
    396      MOZ_RELEASE_ASSERT(NS_SUCCEEDED(rv));
    397    }
    398  }
    399 
    400  rv = chan->AsyncOpen(this);
    401  if (NS_WARN_IF(NS_FAILED(rv))) {
    402    mRecognition->DispatchError(
    403        SpeechRecognition::EVENT_RECOGNITIONSERVICE_ERROR,
    404        SpeechRecognitionErrorCode::Network, "Internal server error");
    405  }
    406 }
    407 
NS_IMETHODIMP
OnlineSpeechRecognitionService::SoundEnd() {
  // Called on the main thread when capture stops: asks the encoder (on its
  // task queue) to finish the stream, then drops the queue reference so this
  // teardown runs at most once.
  MOZ_ASSERT(NS_IsMainThread());

  if (!mEncodeTaskQueue) {
    // Not initialized
    return NS_OK;
  }

  // `self` keeps this service alive until the runnable has executed.
  nsresult rv = mEncodeTaskQueue->Dispatch(NS_NewRunnableFunction(
      "OnlineSpeechRecognitionService::SoundEnd",
      [this, self = RefPtr<OnlineSpeechRecognitionService>(this)]() {
        if (mAudioEncoder) {
          // Flush the encoder, detach its listener, then mux and upload the
          // result via EncoderFinished().
          mAudioEncoder->NotifyEndOfStream();
          mAudioEncoder->UnregisterListener(mSpeechEncoderListener);
          mSpeechEncoderListener = nullptr;
          mAudioEncoder = nullptr;
          EncoderFinished();
        }
      }));
  MOZ_DIAGNOSTIC_ASSERT(NS_SUCCEEDED(rv));
  // rv is only consumed by the diagnostic assert; avoid unused warnings in
  // release builds.
  (void)rv;

  mEncodeTaskQueue = nullptr;

  return NS_OK;
}
    435 
NS_IMETHODIMP
OnlineSpeechRecognitionService::ValidateAndSetGrammarList(
    SpeechGrammar* aSpeechGrammar,
    nsISpeechGrammarCompilationCallback* aCallback) {
  // This is an online LVCSR (STT) service,
  // so we don't need to set a grammar; intentionally a successful no-op.
  return NS_OK;
}
    444 
    445 NS_IMETHODIMP
    446 OnlineSpeechRecognitionService::Abort() {
    447  MOZ_ASSERT(NS_IsMainThread());
    448  if (mAborted) {
    449    return NS_OK;
    450  }
    451  mAborted = true;
    452  return SoundEnd();
    453 }
    454 }  // namespace mozilla