tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

commit a10da7a0320d634a50e797a4b6b1330709d8fde6
parent d57c8879128d3837d4271e02660422099c48e9ba
Author: Greg Tatum <tatum.creative@gmail.com>
Date:   Tue,  4 Nov 2025 20:14:10 +0000

Bug 1994839 - Add support for PDFs in the PageExtractor component; r=calixte

Differential Revision: https://phabricator.services.mozilla.com/D268949

Diffstat:
M toolkit/components/pageextractor/PageExtractorParent.sys.mjs | 12 ++++++++++++
M toolkit/components/pageextractor/tests/browser/browser.toml | 3 +++
A toolkit/components/pageextractor/tests/browser/browser_dom_extractor_pdf.js | 22 ++++++++++++++++++++++
M toolkit/components/pageextractor/tests/browser/head.js | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A toolkit/components/pageextractor/tests/browser/page.pdf | 0
M toolkit/components/pdfjs/content/PdfStreamConverter.sys.mjs | 5 +++++
M toolkit/components/pdfjs/content/PdfjsParent.sys.mjs | 46 ++++++++++++++++++++++++++++++++++++++++++++++
7 files changed, 139 insertions(+), 0 deletions(-)

diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs @@ -37,6 +37,18 @@ export class PageExtractorParent extends JSWindowActorParent { * @returns {Promise<string | null>} */ getText(options = {}) { + if (this.#isPDF()) { + return this.browsingContext.currentWindowGlobal + .getActor("Pdfjs") + .getTextContent(); + } return this.sendQuery("PageExtractorParent:GetText", options); } + + #isPDF() { + return ( + this.browsingContext.currentWindowGlobal.documentPrincipal + .originNoSuffix == "resource://pdf.js" + ); + } } diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml @@ -4,8 +4,11 @@ prefs = [ ] support-files = [ "head.js", + "page.pdf", ] ["browser_dom_extractor.js"] +["browser_dom_extractor_pdf.js"] + ["browser_dom_extractor_reader_mode.js"] diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_pdf.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_pdf.js @@ -0,0 +1,22 @@ +/* Any copyright is dedicated to the Public Domain. + https://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +add_task(async function test_dom_extractor_pdf() { + const { cleanup, getPageExtractor } = await openSupportFile("page.pdf"); + + is( + await getPageExtractor().getText(), + [ + "Etymology of Mochitests", + 'It\'s interesting that inside of Mozilla most people call mochitests "mohkee tests". I believe this is because it is', + 'adjacent to the term"mocha tests", which is pronounced with the hard k sound. However, thetesting', + 'infrastructure is named after the delicious Japanese treat knownas mochi. Mochi, pronounced like "moh', + 'chee" is a food that is made frompounding steamed rice into a soft elastic mass.', + ].join("\n"), + "Text is able to be extracted from the pdf." 
+ ); + + return cleanup(); +}); diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js @@ -1,6 +1,9 @@ /* Any copyright is dedicated to the Public Domain. https://creativecommons.org/publicdomain/zero/1.0/ */ +const BLANK_PAGE = + "data:text/html;charset=utf-8,<!DOCTYPE html><title>Blank</title>Blank page"; + /** * Use a tagged template literal to create a page extraction actor test. This spins * up an http server that serves the markup in a new tab. The page extractor can then @@ -129,3 +132,51 @@ function click(button, message) { } button.click(); } + +/** + * @param {string} file + */ +async function openSupportFile(file) { + // Support files can be served up from example.com + const url_prefix = "https://example.com/browser/"; + const path_prefix = "toolkit/components/pageextractor/tests/browser/"; + const url = url_prefix + path_prefix + file; + + // Start the tab at a blank page. + const tab = await BrowserTestUtils.openNewForegroundTab( + gBrowser, + BLANK_PAGE, + true // waitForLoad + ); + + BrowserTestUtils.startLoadingURIString(tab.linkedBrowser, url); + await BrowserTestUtils.browserLoaded( + tab.linkedBrowser, + /* includeSubFrames */ false, + url + ); + + async function cleanup() { + if (url.endsWith(".pdf")) { + // Wait for the PDFViewerApplication to be closed before removing the + // tab to avoid spurious errors and potential intermittents. 
+ await SpecialPowers.spawn(tab.linkedBrowser, [], async () => { + const viewer = content.wrappedJSObject.PDFViewerApplication; + await viewer.testingClose(); + }); + } + BrowserTestUtils.removeTab(tab); + } + + return { + cleanup, + /** + * @returns {PageExtractorParent} + */ + getPageExtractor() { + return tab.linkedBrowser.browsingContext.currentWindowGlobal.getActor( + "PageExtractor" + ); + }, + }; +} diff --git a/toolkit/components/pageextractor/tests/browser/page.pdf b/toolkit/components/pageextractor/tests/browser/page.pdf Binary files differ. diff --git a/toolkit/components/pdfjs/content/PdfStreamConverter.sys.mjs b/toolkit/components/pdfjs/content/PdfStreamConverter.sys.mjs @@ -501,6 +501,11 @@ class ChromeActions { actor?.sendAsyncMessage("PDFJS:Parent:reportTelemetry", data); } + reportText(data) { + const actor = getActor(this.domWindow); + actor?.sendAsyncMessage("PDFJS:Parent:reportText", data); + } + updateFindControlState(data) { if (!this.supportsIntegratedFind()) { return; diff --git a/toolkit/components/pdfjs/content/PdfjsParent.sys.mjs b/toolkit/components/pdfjs/content/PdfjsParent.sys.mjs @@ -73,6 +73,15 @@ export class PdfjsParent extends JSWindowActorParent { "enableNewAltTextWhenAddingImage", ]); + #nextTextRequestId = 0; + + /** + * Holds the Promise resolves for getTextContent requests. + * + * @type {Map<number, (text: string) => void>} + */ + #textRequests = new Map(); + constructor() { super(); this._boundToFindbar = null; @@ -104,6 +113,8 @@ export class PdfjsParent extends JSWindowActorParent { return this._addEventListener(); case "PDFJS:Parent:saveURL": return this._saveURL(aMsg); + case "PDFJS:Parent:reportText": + return this._reportText(aMsg); case "PDFJS:Parent:recordExposure": return this._recordExposure(); case "PDFJS:Parent:reportTelemetry": @@ -132,6 +143,22 @@ export class PdfjsParent extends JSWindowActorParent { return this.browsingContext.top.embedderElement; } + /** + * Extracts the text content from a PDF. 
+ * + * @returns {Promise<string>} + */ + getTextContent() { + const { promise, resolve } = Promise.withResolvers(); + const requestId = this.#nextTextRequestId++; + this.#textRequests.set(requestId, resolve); + this.sendAsyncMessage("PDFJS:Child:handleEvent", { + type: "requestTextContent", + detail: { requestId }, + }); + return promise; + } + async #openDatabase() { return lazy.IndexedDB.open(PDFJS_DB_NAME, PDFJS_DB_VERSION, db => { db.createObjectStore(PDFJS_STORE_NAME, { @@ -584,6 +611,25 @@ export class PdfjsParent extends JSWindowActorParent { }); } + /** + * Handle the response for extracting text. + * + * @param {{ data: { text: string, requestId: number } }} + */ + _reportText({ data }) { + const resolve = this.#textRequests.get(data.requestId); + this.#textRequests.delete(data.requestId); + if (!resolve) { + console.error( + "Unable to find the text content request", + data.requestId, + this.#textRequests + ); + return; + } + resolve(data.text); + } + _updateMatchesCount(aMsg) { let data = aMsg.data; let browser = this.browser;