tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 8a4d9fbac8a284bfa430f470f01322d485e28dec
parent 60e47676743cc4a2e3b976864f0a9d01145374f0
Author: Erik Nordin <enordin@mozilla.com>
Date:   Sat, 18 Oct 2025 02:16:51 +0000

Bug 1967758 - Utilize PageExtractor for Translations language detection r=translations-reviewers,gregtatum

This patch updates the TranslationsParent language-identification logic
for Full-Page Translations to utilize the new PageExtractor functionality,
which should improve the quality of our language detection compared to
the previous implementation.

Differential Revision: https://phabricator.services.mozilla.com/D267655

Diffstat:
M toolkit/components/translations/actors/TranslationsChild.sys.mjs | 29 -----------------------------
M toolkit/components/translations/actors/TranslationsParent.sys.mjs | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
2 files changed, 61 insertions(+), 40 deletions(-)

diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs @@ -8,8 +8,6 @@ ChromeUtils.defineESModuleGetters(lazy, { "chrome://global/content/translations/translations-document.sys.mjs", LRUCache: "chrome://global/content/translations/translations-document.sys.mjs", - LanguageDetector: - "resource://gre/modules/translations/LanguageDetector.sys.mjs", }); /** @@ -117,33 +115,6 @@ export class TranslationsChild extends JSWindowActorChild { case "Translations:GetDocumentElementLang": { return this.document.documentElement.lang; } - case "Translations:IdentifyLanguage": { - // Wait for idle callback as the page will be more settled if it has - // dynamic content, like on a React app. - if (this.contentWindow) { - await new Promise(resolve => { - this.contentWindow.requestIdleCallback(resolve); - }); - } - - if (this.#isDestroyed) { - return undefined; - } - - const startTime = ChromeUtils.now(); - const detectionResult = - await lazy.LanguageDetector.detectLanguageFromDocument(this.document); - - if (this.#isDestroyed) { - return undefined; - } - - this.addProfilerMarker( - `Detect language from document: ${detectionResult.language}`, - startTime - ); - return detectionResult; - } case "Translations:AcquirePort": { this.addProfilerMarker("Acquired a port, resuming translations"); this.#translatedDoc.acquirePort(data.port); diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs @@ -81,6 +81,8 @@ XPCOMUtils.defineLazyServiceGetters(lazy, { }); ChromeUtils.defineESModuleGetters(lazy, { + LanguageDetector: + "resource://gre/modules/translations/LanguageDetector.sys.mjs", RemoteSettings: "resource://services-settings/remote-settings.sys.mjs", setTimeout: "resource://gre/modules/Timer.sys.mjs", TranslationsTelemetry: @@ -445,6 +447,27 @@ export class TranslationsParent extends JSWindowActorParent { 
static LANGUAGE_MODEL_MAJOR_VERSION_MAX = 3; /** + * The shorter the text, the less confidence we should have in the result of the language + * identification. Add another heuristic to report the ID as not confident if the length + * of the code units of the text is less than this threshold. + * + * This was determined by plotting a kernel density estimation of the number of times the + * source language had to be changed in the SelectTranslationsPanel vs. the code units in + * the source text. + * + * 0013 code units or less - 49.5% of language changes + * 0036 code units or less - 74.9% of language changes + * 0153 code units or less - 90.0% of language changes + * 0200 code units or less - 91.5% of language changes + * 0427 code units or less - 95.0% of language changes + * 1382 code units or less - 98.0% of language changes + * 3506 code units or less - 99.0% of language changes + * + * @type {number} + */ + static #DOC_CONFIDENCE_THRESHOLD = 150; + + /** * Contains the state that would affect UI. Anytime this state is changed, a dispatch * event is sent so that UI can react to it. The actor is inside of /toolkit and * needs a way of notifying /browser code (or other users) of when the state changes. @@ -782,7 +805,7 @@ export class TranslationsParent extends JSWindowActorParent { // popup will not be shown. if (detectedLanguages.htmlLangAttribute && !detectedLanguages.identified) { // Compare language langTagsMatch - detectedLanguages.identified = await this.queryIdentifyLanguage(); + detectedLanguages.identified = await this.#identifyPageLanguage(); if ( !lazy.TranslationsUtils.langTagsMatch( @@ -3470,20 +3493,47 @@ export class TranslationsParent extends JSWindowActorParent { } /** + * Uses the page extractor to identify the current page's language. 
+ * * @returns {Promise<DetectionResult>} */ - async queryIdentifyLanguage() { + async #identifyPageLanguage() { if (this.languageState?.detectedLanguages?.identified) { return this.languageState.detectedLanguages.identified; } - return this.sendQuery("Translations:IdentifyLanguage").catch(error => { - if (this.#isDestroyed) { - // The actor was destroyed while this message was still being resolved. - return null; - } - return Promise.reject(error); - }); + const actor = + this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor"); + + if (!actor) { + throw new Error("Unable to get the PageExtractor actor."); + } + + const startTime = ChromeUtils.now(); + + const pageText = await actor.getText(); + if (this.#isDestroyed) { + return { language: "", confident: false, languages: [] }; + } + + const result = await lazy.LanguageDetector.detectLanguage(pageText); + if (this.#isDestroyed) { + return { language: "", confident: false, languages: [] }; + } + + const message = `Identified page language as "${result.language}": ${this.browsingContext?.currentURI?.spec}`; + ChromeUtils.addProfilerMarker( + "TranslationsParent", + { startTime, innerWindowId: this.innerWindowId }, + message + ); + lazy.console.debug(message); + + if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) { + result.confident = false; + } + + return result; } /** @@ -3526,7 +3576,7 @@ export class TranslationsParent extends JSWindowActorParent { // Do a final check that the identified language matches the reported language // tag to ensure that the page isn't reporting the incorrect languages. This // check is deferred to now for performance considerations. 
- langTags.identified = await this.queryIdentifyLanguage(); + langTags.identified = await this.#identifyPageLanguage(); langTags.docLangTag = langTags.identified.language; if ( @@ -3760,7 +3810,7 @@ export class TranslationsParent extends JSWindowActorParent { if (!langTags.docLangTag) { // If the document's markup had no specified langTag, attempt to identify the // page's language. - langTags.identified = await this.queryIdentifyLanguage(); + langTags.identified = await this.#identifyPageLanguage(); langTags.docLangTag = langTags.identified.language; maybeNormalizeDocLangTag(); langTags.identified.language = langTags.docLangTag;