[ tor-browser ].git.dasho

commit 38f389d8292b0481625dae12543e725f389b8aa7
parent c14535a918527d5324c7224f1782bbd7c9ec7b7f
Author: Erik Nordin <enordin@mozilla.com>
Date:   Sat, 18 Oct 2025 02:16:52 +0000

Bug 1967758 - Add sufficientLength option to DOM text extractor r=translations-reviewers,gregtatum

This patch adds a `sufficientLength` option to the DOM text extractor
logic, which will bail out of text extraction once the extracted text
has reached or surpassed the provided length in code units.

This saves significant resources if the purpose of the extraction does
not need to extract the entirety of the document's text, such as for
Translations language identification.

Differential Revision: https://phabricator.services.mozilla.com/D268284

Diffstat:
M toolkit/components/pageextractor/DOMExtractor.sys.mjs  | 30 ++++++++++++++++++++++++++++++
M toolkit/components/pageextractor/PageExtractor.d.ts  | 8 ++++++--
M toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js  | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
M toolkit/components/translations/actors/TranslationsParent.sys.mjs  | 14 ++++++++++++--

4 files changed, 97 insertions(+), 5 deletions(-)
diff --git a/toolkit/components/pageextractor/DOMExtractor.sys.mjs b/toolkit/components/pageextractor/DOMExtractor.sys.mjs
@@ -52,6 +52,25 @@ class ExtractionContext {
   }
 
   /**
+   * Returns true if a condition has been met such that the text
+   * extraction should stop early, otherwise false.
+   *
+   * @returns {boolean}
+   */
+  shouldStopExtraction() {
+    const { sufficientLength } = this.#options;
+
+    if (
+      sufficientLength !== undefined &&
+      this.#textContent.length >= sufficientLength
+    ) {
+      return true;
+    }
+
+    return false;
+  }
+
+  /**
    * Returns true if this node or its ancestor's text content has
    * already been extracted from the DOM.
    *
@@ -410,6 +429,10 @@ function hasNonWhitespaceTextNodes(node) {
  * @param {ExtractionContext} context
  */
 function subdivideAndExtractText(node, context) {
+  if (context.shouldStopExtraction()) {
+    return;
+  }
+
   switch (determineBlockStatus(node)) {
     case NodeFilter.FILTER_REJECT: {
       // This node is rejected as it shouldn't be used for text extraction.
@@ -449,6 +472,10 @@ function subdivideAndExtractText(node, context) {
  * @param {ExtractionContext} context
  */
 function processSubdivide(node, context) {
+  if (context.shouldStopExtraction()) {
+    return;
+  }
+
   const { ownerDocument } = node;
   if (!ownerDocument) {
     return;
@@ -470,6 +497,9 @@ function processSubdivide(node, context) {
     } else {
       context.maybeAppendTextContent(currentNode);
     }
+    if (context.shouldStopExtraction()) {
+      return;
+    }
   }
 }
 
diff --git a/toolkit/components/pageextractor/PageExtractor.d.ts b/toolkit/components/pageextractor/PageExtractor.d.ts
@@ -2,9 +2,13 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-export interface GetTextOptions {
+export type GetTextOptions = Partial<{
+  // The length of extracted text that is sufficient for the purpose.
+  // When set, extraction will stop when the text meets or exceeds this length.
+  // When unset, the length of the extracted text is unbounded.
+  sufficientLength: number;
   // Remove menus and other boilerplate.
   removeBoilerplate: boolean;
   // Just include the viewport content.
   justViewport: boolean;
-}
+}>;
diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js
@@ -8,7 +8,7 @@
  * @import { PageExtractorParent } from "../../PageExtractorParent.sys.mjs"
  */
 
-add_task(async function test_dom_extractor() {
+add_task(async function test_dom_extractor_default_options() {
   const { actor, cleanup } = await html`
     <article>
       <h1>Hello World</h1>
@@ -35,3 +35,51 @@ add_task(async function test_dom_extractor() {
   );
   return cleanup();
 });
+
+add_task(async function test_dom_extractor_sufficient_length_option() {
+  const { actor, cleanup } = await html`
+    <article>
+      <h1>Hello World</h1>
+      <p>First paragraph.</p>
+      <p>Second paragraph.</p>
+    </article>
+  `;
+
+  const header = "Hello World";
+  const headerAndP1 = ["Hello World", "First paragraph."].join("\n");
+  const allText = ["Hello World", "First paragraph.", "Second paragraph."].join(
+    "\n"
+  );
+
+  is(
+    await actor.getText(),
+    allText,
+    "All text is returned with the default options."
+  );
+
+  const max = allText.length + 1;
+  const expectations = [
+    [length => length === 0, ""],
+    [length => length > 0 && length <= 12, header],
+    [length => length > 12 && length <= 29, headerAndP1],
+    [length => length > 29 && length <= max, allText],
+  ];
+
+  for (let sufficientLength = 0; sufficientLength <= max; ++sufficientLength) {
+    let expectedValue;
+
+    for (const [predicate, value] of expectations) {
+      if (predicate(sufficientLength)) {
+        expectedValue = value;
+      }
+    }
+
+    is(
+      await actor.getText({ sufficientLength }),
+      expectedValue,
+      `The text, given sufficientLength of ${sufficientLength}, matches the expectation.`
+    );
+  }
+
+  return cleanup();
+});
diff --git a/toolkit/components/translations/actors/TranslationsParent.sys.mjs b/toolkit/components/translations/actors/TranslationsParent.sys.mjs
@@ -480,7 +480,7 @@ export class TranslationsParent extends JSWindowActorParent {
    * This is the worst-case scenario where we will start scraping
    * the page text even if it has not yet fully loaded.
    */
-  static #REACT_TO_PAGE_LANGUAGE_TIMEOUT = 2000;
+  static #REACT_TO_PAGE_LANGUAGE_TIMEOUT = 500;
 
   /**
    * A race that determines when to react to the page's language tag.
@@ -3572,7 +3572,17 @@ export class TranslationsParent extends JSWindowActorParent {
 
     const startTime = ChromeUtils.now();
 
-    const pageText = await actor.getText();
+    // Manual profiling on 10 page loads of https://es.wikipedia.org/wiki/Felis_catus:
+    // -------------------------------------------------------------------------------
+    //
+    //   No limit: 2064 samples, 224/237/294 [min/med/max]ms (~85k code units)
+    // 8192 limit:  681 samples,  75/ 87/128 [min/med/max]ms
+    // 4096 limit:  457 samples,  51/ 55/ 97 [min/med/max]ms
+    // 2048 limit:  240 samples,  29/ 39/ 64 [min/med/max]ms
+    // 1024 limit:  142 samples,  19/ 28/ 58 [min/med/max]ms
+    //
+    // 2048 Code units feels like a decent length for performance and sample size.
+    const pageText = await actor.getText({ sufficientLength: 2048 });
     if (this.#isDestroyed) {
       return { language: "", confident: false, languages: [] };
     }

	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE

M	toolkit/components/pageextractor/DOMExtractor.sys.mjs	\|	30	++++++++++++++++++++++++++++++
M	toolkit/components/pageextractor/PageExtractor.d.ts	\|	8	++++++--
M	toolkit/components/pageextractor/tests/browser/browser_dom_extractor.js	\|	50	+++++++++++++++++++++++++++++++++++++++++++++++++-
M	toolkit/components/translations/actors/TranslationsParent.sys.mjs	\|	14	++++++++++++--