tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit a07fd9772bbb931e789d55a0af554a7a8850d5b1
parent ccf612b0a4ae5727f7c50623a3e635db90e98c57
Author: Erik Nordin <enordin@mozilla.com>
Date:   Fri, 24 Oct 2025 19:56:59 +0000

Bug 1995634 - Use nsIDocumentEncoder for Translations langId r=translations-reviewers,gregtatum

This patch updates the Translations language-detection algorithm
to utilize nsIDocumentEncoder for the page-text extraction.

Differential Revision: https://phabricator.services.mozilla.com/D269460

Diffstat:
M toolkit/components/translations/actors/TranslationsChild.sys.mjs | 21 +++++++++++++++++++++
M toolkit/components/translations/actors/TranslationsParent.sys.mjs | 71 +++++++++++++++++++++++++++++++++++++++++------------------------------
2 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs @@ -78,6 +78,27 @@ export class TranslationsChild extends JSWindowActorChild { this.#translatedDoc?.enterLazyTranslationsMode(); return undefined; } + case "Translations:ExtractPageText": { + const { document } = this; + if (!document) { + return ""; + } + + const { sufficientLength } = data; + + const encoder = Cu.createDocumentEncoder("text/plain"); + encoder.init( + document, + "text/plain", + Ci.nsIDocumentEncoder.OutputBodyOnly | + Ci.nsIDocumentEncoder.SkipInvisibleContent | + Ci.nsIDocumentEncoder.AllowCrossShadowBoundary | + Ci.nsIDocumentEncoder.OutputDropInvisibleBreak | + Ci.nsIDocumentEncoder.OutputDisallowLineBreaking + ); + + return encoder.encodeToStringWithMaxLength(sufficientLength); + } case "Translations:TranslatePage": { if (this.#translatedDoc?.engineStatus === "error") { this.#translatedDoc.destroy(); diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs @@ -3554,7 +3554,8 @@ export class TranslationsParent extends JSWindowActorParent { } /** - * Uses the page extractor to identify the current page's language. + * Extracts a substring of visible text from the content document and + * runs it through the language detector to determine the page's language. 
* * @returns {Promise<DetectionResult>} */ @@ -3563,48 +3564,58 @@ export class TranslationsParent extends JSWindowActorParent { return this.languageState.detectedLanguages.identified; } - const actor = - this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor"); - - if (!actor) { - throw new Error("Unable to get the PageExtractor actor."); - } + lazy.console.log( + "Beginning text extraction:", + this.browsingContext?.currentURI?.spec + ); - const startTime = ChromeUtils.now(); + const extractionStartTime = ChromeUtils.now(); + const pageText = await this.sendQuery("Translations:ExtractPageText", { + sufficientLength: 4096, + }); - // Manual profiling on 10 page loads of https://es.wikipedia.org/wiki/Felis_catus: - // ------------------------------------------------------------------------------- - // - // No limit: 2064 samples, 224/237/294 [min/med/max]ms (~85k code units) - // 8192 limit: 681 samples, 75/ 87/128 [min/med/max]ms - // 4096 limit: 457 samples, 51/ 55/ 97 [min/med/max]ms - // 2048 limit: 240 samples, 29/ 39/ 64 [min/med/max]ms - // 1024 limit: 142 samples, 19/ 28/ 58 [min/med/max]ms - // - // 2048 Code units feels like a decent length for performance and sample size. 
- const pageText = await actor.getText({ sufficientLength: 2048 }); if (this.#isDestroyed) { - return { language: "", confident: false, languages: [] }; + return { language: "en", confident: false, languages: [] }; } + lazy.console.debug( + `Extracted Page Text (${pageText.length} code units):\n\n`, + pageText + ); + + const extractionLog = + `Extracted ${pageText.length} code units of text in ` + + `${(ChromeUtils.now() - extractionStartTime).toFixed(3)} ms.`; + + lazy.console.log(extractionLog); + ChromeUtils.addProfilerMarker( + "TranslationsParent", + { startTime: extractionStartTime, innerWindowId: this.innerWindowId }, + extractionLog + ); + + const identificationStartTime = ChromeUtils.now(); const result = await lazy.LanguageDetector.detectLanguage(pageText); + if (this.#isDestroyed) { - return { language: "", confident: false, languages: [] }; + return { language: "en", confident: false, languages: [] }; } - const message = - `Identified page language as "${result.language}" ` + - `in ${((ChromeUtils.now() - startTime) / 1000).toFixed(3)} seconds: ` + - this.browsingContext?.currentURI?.spec; + const identificationLog = + `Identified ${pageText.length} code units of text as "${result.language}" ` + + `in ${(ChromeUtils.now() - identificationStartTime).toFixed(3)} ms.`; + lazy.console.log(identificationLog); ChromeUtils.addProfilerMarker( "TranslationsParent", - { startTime, innerWindowId: this.innerWindowId }, - message + { startTime: identificationStartTime, innerWindowId: this.innerWindowId }, + identificationLog + ); + ChromeUtils.addProfilerMarker( + "TranslationsParent", + { startTime: extractionStartTime, innerWindowId: this.innerWindowId }, + "Total time to identify page language." ); - - lazy.console.debug("\nExtracted Page Text:\n\n", pageText); - lazy.console.log(message); if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) { result.confident = false;