tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit 2db2ad618c5d8148357fdd8d720f8c2225dfdc79
parent 77d5e074eecb55cdb3d568a284f65913add22ee2
Author: Cristina Horotan <chorotan@mozilla.com>
Date:   Thu,  9 Oct 2025 01:19:36 +0300

Revert "Bug 1967758 - Utilize PageExtractor for Translations language detection r=translations-reviewers,gregtatum" on request

This reverts commit 421732532b12b44076f9e7b894c7d83e728c0f74.

Revert "Bug 1967758 - Remove unused identifiedLangTag entry r=translations-reviewers,gregtatum"

This reverts commit bb6366e8e23f342a0750bb300f97feeecb8be2cc.

Revert "Bug 1967758 - Remove LanguageDetector.detectLanguageFromDocument r=translations-reviewers,gregtatum"

This reverts commit bcbd638c2a3b0f267f98c39e8ced0306a9ebf51f.

Diffstat:
M browser/components/translations/tests/browser/browser_translations_full_page_language_id_behavior.js | 54 ------------------------------------------------------
M toolkit/components/translations/LanguageDetector.sys.mjs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M toolkit/components/translations/actors/TranslationsChild.sys.mjs | 29 +++++++++++++++++++++++++++++
M toolkit/components/translations/actors/TranslationsParent.sys.mjs | 77 +++++++++++----------------------------------------------------------------
4 files changed, 116 insertions(+), 120 deletions(-)

diff --git a/browser/components/translations/tests/browser/browser_translations_full_page_language_id_behavior.js b/browser/components/translations/tests/browser/browser_translations_full_page_language_id_behavior.js @@ -296,57 +296,3 @@ add_task(async function test_language_identification_behavior() { await cleanup(); } }); - -/** - * This test case tests the behavior when the page has no declared language - * tag and the detected language is not supported by Translations. - */ -add_task(async function test_detected_language_unsupported() { - info("Testing unsupported detected language with no declared language"); - TranslationsParent.testAutomaticPopup = true; - - let wasPopupShown = false; - window.FullPageTranslationsPanel.elements; // De-lazify the panel. - - const { resolve } = Promise.withResolvers(); - const panel = window.document.getElementById("full-page-translations-panel"); - - function handlePopupShown() { - wasPopupShown = true; - panel.removeEventListener("popupshown", handlePopupShown); - resolve(); - } - panel.addEventListener("popupshown", handlePopupShown); - - const { cleanup, runInPage } = await loadTestPage({ - page: SPANISH_PAGE_UNDECLARED_URL, - // Deliberately omit Spanish so that it is not supported. - languagePairs: [ - { fromLang: "en", toLang: "fr" }, - { fromLang: "fr", toLang: "en" }, - { fromLang: "en", toLang: "uk" }, - { fromLang: "uk", toLang: "en" }, - ], - autoDownloadFromRemoteSettings: true, - contentEagerMode: true, - }); - - await FullPageTranslationsTestUtils.assertTranslationsButton( - { button: false }, - "The translations button is not visible when the detected language is unsupported." - ); - - await FullPageTranslationsTestUtils.assertPageIsNotTranslated( - runInPage, - "No translation should occur when the detected language is unsupported." - ); - - is( - wasPopupShown, - false, - "A translation was not offered for an unsupported detected language." 
- ); - - TranslationsParent.testAutomaticPopup = false; - await cleanup(); -}); diff --git a/toolkit/components/translations/LanguageDetector.sys.mjs b/toolkit/components/translations/LanguageDetector.sys.mjs @@ -11,6 +11,54 @@ const WORKER_URL = "resource://gre/modules/translations/cld-worker.js"; */ /** + * The options used for when detecting a language. + * + * @typedef {object} DetectionOptions + * + * @property {string} text - The text to analyze. + * @property {boolean} [isHTML] - A boolean, indicating whether the text should be analyzed as + * HTML rather than plain text. + * @property {string} [language] - A string indicating the expected language. For text + * extracted from HTTP documents, this is expected to come from the Content-Language + * header. + * @property {string} [tld] - A string indicating the top-level domain of the document the + * text was extracted from. + * @property {string} [encoding] - A string describing the encoding of the document the + * string was extracted from. Note that, regardless of the value of this property, + * the 'text' property must be a UTF-16 JavaScript string. + */ + +/** + * The length of the substring to pull from the document's text for language + * identification. + * + * This value should ideally be one that is large enough to yield a confident + * identification result without being too large or expensive to extract. + * + * At this time, this value is not driven by statistical data or analysis. + */ +const DOC_TEXT_TO_IDENTIFY_LENGTH = 1024; + +/** + * The shorter the text, the less confidence we should have in the result of the language + * identification. Add another heuristic to report the ID as not confident if the length + * of the code points of the text is less than this threshold. + * + * This was determined by plotting a kernel density estimation of the number of times the + * source language had to be changed in the SelectTranslationsPanel vs. the code units in + * the source text. 
+ * + * 0013 code units or less - 49.5% of language changes + * 0036 code units or less - 74.9% of language changes + * 0153 code units or less - 90.0% of language changes + * 0200 code units or less - 91.5% of language changes + * 0427 code units or less - 95.0% of language changes + * 1382 code units or less - 98.0% of language changes + * 3506 code units or less - 99.0% of language changes + */ +const DOC_CONFIDENCE_THRESHOLD = 200; + +/** * An internal class to manage communicating to the worker, and managing its lifecycle. * It's initialized once below statically to the module. */ @@ -136,8 +184,7 @@ class WorkerManager { export const workerManager = new WorkerManager(); /** - * This class provides the ability to identify the language of text using - * the CLD2 language-detection algorithm. + * */ export class LanguageDetector { /** @@ -154,4 +201,29 @@ export class LanguageDetector { return workerManager.detectLanguage(options); } + + /** + * Attempts to determine the language in which the document's content is written. + * + * @param {Document} document + * @returns {Promise<DetectionResult>} + */ + static async detectLanguageFromDocument(document) { + // Grab a selection of text. 
+ let encoder = Cu.createDocumentEncoder("text/plain"); + encoder.init(document, "text/plain", encoder.SkipInvisibleContent); + let text = encoder + .encodeToStringWithMaxLength(DOC_TEXT_TO_IDENTIFY_LENGTH) + .replaceAll("\r", "") + .replaceAll("\n", " "); + + const result = await workerManager.detectLanguage({ + text, + }); + + if (text.length < DOC_CONFIDENCE_THRESHOLD) { + result.confident = false; + } + return result; + } } diff --git a/toolkit/components/translations/actors/TranslationsChild.sys.mjs b/toolkit/components/translations/actors/TranslationsChild.sys.mjs @@ -8,6 +8,8 @@ ChromeUtils.defineESModuleGetters(lazy, { "chrome://global/content/translations/translations-document.sys.mjs", LRUCache: "chrome://global/content/translations/translations-document.sys.mjs", + LanguageDetector: + "resource://gre/modules/translations/LanguageDetector.sys.mjs", }); /** @@ -115,6 +117,33 @@ export class TranslationsChild extends JSWindowActorChild { case "Translations:GetDocumentElementLang": { return this.document.documentElement.lang; } + case "Translations:IdentifyLanguage": { + // Wait for idle callback as the page will be more settled if it has + // dynamic content, like on a React app. 
+ if (this.contentWindow) { + await new Promise(resolve => { + this.contentWindow.requestIdleCallback(resolve); + }); + } + + if (this.#isDestroyed) { + return undefined; + } + + const startTime = ChromeUtils.now(); + const detectionResult = + await lazy.LanguageDetector.detectLanguageFromDocument(this.document); + + if (this.#isDestroyed) { + return undefined; + } + + this.addProfilerMarker( + `Detect language from document: ${detectionResult.language}`, + startTime + ); + return detectionResult; + } case "Translations:AcquirePort": { this.addProfilerMarker("Acquired a port, resuming translations"); this.#translatedDoc.acquirePort(data.port); diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs @@ -81,8 +81,6 @@ XPCOMUtils.defineLazyServiceGetters(lazy, { }); ChromeUtils.defineESModuleGetters(lazy, { - LanguageDetector: - "resource://gre/modules/translations/LanguageDetector.sys.mjs", RemoteSettings: "resource://services-settings/remote-settings.sys.mjs", setTimeout: "resource://gre/modules/Timer.sys.mjs", TranslationsTelemetry: @@ -447,27 +445,6 @@ export class TranslationsParent extends JSWindowActorParent { static LANGUAGE_MODEL_MAJOR_VERSION_MAX = 3; /** - * The shorter the text, the less confidence we should have in the result of the language - * identification. Add another heuristic to report the ID as not confident if the length - * of the code points of the text is less than this threshold. - * - * This was determined by plotting a kernel density estimation of the number of times the - * source language had to be changed in the SelectTranslationsPanel vs. the code units in - * the source text. 
- * - * 0013 code units or less - 49.5% of language changes - * 0036 code units or less - 74.9% of language changes - * 0153 code units or less - 90.0% of language changes - * 0200 code units or less - 91.5% of language changes - * 0427 code units or less - 95.0% of language changes - * 1382 code units or less - 98.0% of language changes - * 3506 code units or less - 99.0% of language changes - * - * @type {number} - */ - static #DOC_CONFIDENCE_THRESHOLD = 150; - - /** * Contains the state that would affect UI. Anytime this state is changed, a dispatch * event is sent so that UI can react to it. The actor is inside of /toolkit and * needs a way of notifying /browser code (or other users) of when the state changes. @@ -818,7 +795,7 @@ export class TranslationsParent extends JSWindowActorParent { // popup will not be shown. if (detectedLanguages.htmlLangAttribute && !detectedLanguages.identified) { // Compare language langTagsMatch - detectedLanguages.identified = await this.#identifyPageLanguage(); + detectedLanguages.identified = await this.queryIdentifyLanguage(); if ( !lazy.TranslationsUtils.langTagsMatch( @@ -867,7 +844,7 @@ export class TranslationsParent extends JSWindowActorParent { if ( !TranslationsParent.findCompatibleSourceLangTagSync( - detectedLanguages.identified.language, + detectedLanguages.identifiedLangTag, await TranslationsParent.getNonPivotLanguagePairs() ) ) { @@ -3493,11 +3470,9 @@ export class TranslationsParent extends JSWindowActorParent { } /** - * Uses the page extractor to identify the current page's language. - * * @returns {Promise<DetectionResult>} */ - async #identifyPageLanguage() { + async queryIdentifyLanguage() { if ( TranslationsParent.isInAutomation() && !TranslationsParent.#isTranslationsEngineMocked @@ -3505,39 +3480,13 @@ export class TranslationsParent extends JSWindowActorParent { // In automation assume English is the language, but don't be confident. 
return { confident: false, language: "en", languages: [] }; } - - const actor = - this.browsingContext?.currentWindowGlobal?.getActor("PageExtractor"); - - if (!actor) { - throw new Error("Unable to get the PageExtractor actor."); - } - - const startTime = ChromeUtils.now(); - - const pageText = await actor.getText(); - if (this.#isDestroyed) { - return { language: "", confident: false, languages: [] }; - } - - const result = await lazy.LanguageDetector.detectLanguage(pageText); - if (this.#isDestroyed) { - return { language: "", confident: false, languages: [] }; - } - - const message = `Identified page language as "${result.language}": ${this.browsingContext?.currentURI?.spec}`; - ChromeUtils.addProfilerMarker( - "TranslationsParent", - { startTime, innerWindowId: this.innerWindowId }, - message - ); - lazy.console.debug(message); - - if (pageText.length < TranslationsParent.#DOC_CONFIDENCE_THRESHOLD) { - result.confident = false; - } - - return result; + return this.sendQuery("Translations:IdentifyLanguage").catch(error => { + if (this.#isDestroyed) { + // The actor was destroyed while this message was still being resolved. + return null; + } + return Promise.reject(error); + }); } /** @@ -3580,7 +3529,7 @@ export class TranslationsParent extends JSWindowActorParent { // Do a final check that the identified language matches the reported language // tag to ensure that the page isn't reporting the incorrect languages. This // check is deferred to now for performance considerations. - langTags.identified = await this.#identifyPageLanguage(); + langTags.identified = await this.queryIdentifyLanguage(); langTags.docLangTag = langTags.identified.language; if ( @@ -3763,7 +3712,7 @@ export class TranslationsParent extends JSWindowActorParent { userLangTag: null, isDocLangTagSupported: false, htmlLangAttribute: htmlLangAttribute ?? 
null, - identified: null, + identifiedLangTag: null, }; /** @@ -3814,7 +3763,7 @@ export class TranslationsParent extends JSWindowActorParent { if (!langTags.docLangTag) { // If the document's markup had no specified langTag, attempt to identify the // page's language. - langTags.identified = await this.#identifyPageLanguage(); + langTags.identified = await this.queryIdentifyLanguage(); langTags.docLangTag = langTags.identified.language; maybeNormalizeDocLangTag(); langTags.identified.language = langTags.docLangTag;