commit a10da7a0320d634a50e797a4b6b1330709d8fde6
parent d57c8879128d3837d4271e02660422099c48e9ba
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Tue, 4 Nov 2025 20:14:10 +0000
Bug 1994839 - Add support for PDFs in the PageExtractor component; r=calixte
Differential Revision: https://phabricator.services.mozilla.com/D268949
Diffstat:
7 files changed, 139 insertions(+), 0 deletions(-)
diff --git a/toolkit/components/pageextractor/PageExtractorParent.sys.mjs b/toolkit/components/pageextractor/PageExtractorParent.sys.mjs
@@ -37,6 +37,18 @@ export class PageExtractorParent extends JSWindowActorParent {
* @returns {Promise<string | null>}
*/
getText(options = {}) {
+ if (this.#isPDF()) {
+ return this.browsingContext.currentWindowGlobal
+ .getActor("Pdfjs")
+ .getTextContent();
+ }
return this.sendQuery("PageExtractorParent:GetText", options);
}
+
+ #isPDF() {
+ return (
+ this.browsingContext.currentWindowGlobal.documentPrincipal
+ .originNoSuffix == "resource://pdf.js"
+ );
+ }
}
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -4,8 +4,11 @@ prefs = [
]
support-files = [
"head.js",
+ "page.pdf",
]
["browser_dom_extractor.js"]
+["browser_dom_extractor_pdf.js"]
+
["browser_dom_extractor_reader_mode.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_pdf.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_pdf.js
@@ -0,0 +1,22 @@
+/* Any copyright is dedicated to the Public Domain.
+ https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+add_task(async function test_dom_extractor_pdf() {
+ const { cleanup, getPageExtractor } = await openSupportFile("page.pdf");
+
+ is(
+ await getPageExtractor().getText(),
+ [
+ "Etymology of Mochitests",
+ 'It\'s interesting that inside of Mozilla most people call mochitests "mohkee tests". I believe this is because it is',
+ 'adjacent to the term"mocha tests", which is pronounced with the hard k sound. However, thetesting',
+ 'infrastructure is named after the delicious Japanese treat knownas mochi. Mochi, pronounced like "moh',
+ 'chee" is a food that is made frompounding steamed rice into a soft elastic mass.',
+ ].join("\n"),
+ "Text is able to be extracted from the pdf."
+ );
+
+ return cleanup();
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -1,6 +1,9 @@
/* Any copyright is dedicated to the Public Domain.
https://creativecommons.org/publicdomain/zero/1.0/ */
+const BLANK_PAGE =
+ "data:text/html;charset=utf-8,<!DOCTYPE html><title>Blank</title>Blank page"; // Minimal HTML document used as a blank starting page.
+
/**
* Use a tagged template literal to create a page extraction actor test. This spins
* up an http server that serves the markup in a new tab. The page extractor can then
@@ -129,3 +132,51 @@ function click(button, message) {
}
button.click();
}
+
+/**
+ * @param {string} file
+ */
+async function openSupportFile(file) {
+ // Support files can be served up from example.com
+ const url_prefix = "https://example.com/browser/";
+ const path_prefix = "toolkit/components/pageextractor/tests/browser/";
+ const url = url_prefix + path_prefix + file;
+
+ // Start the tab at a blank page.
+ const tab = await BrowserTestUtils.openNewForegroundTab(
+ gBrowser,
+ BLANK_PAGE,
+ true // waitForLoad
+ );
+
+ BrowserTestUtils.startLoadingURIString(tab.linkedBrowser, url);
+ await BrowserTestUtils.browserLoaded(
+ tab.linkedBrowser,
+ /* includeSubFrames */ false,
+ url
+ );
+
+ async function cleanup() {
+ if (url.endsWith(".pdf")) {
+ // Wait for the PDFViewerApplication to be closed before removing the
+ // tab to avoid spurious errors and potential intermittents.
+ await SpecialPowers.spawn(tab.linkedBrowser, [], async () => {
+ const viewer = content.wrappedJSObject.PDFViewerApplication;
+ await viewer.testingClose();
+ });
+ }
+ BrowserTestUtils.removeTab(tab);
+ }
+
+ return {
+ cleanup,
+ /**
+ * @returns {PageExtractorParent}
+ */
+ getPageExtractor() {
+ return tab.linkedBrowser.browsingContext.currentWindowGlobal.getActor(
+ "PageExtractor"
+ );
+ },
+ };
+}
diff --git a/toolkit/components/pageextractor/tests/browser/page.pdf b/toolkit/components/pageextractor/tests/browser/page.pdf
Binary files differ.
diff --git a/toolkit/components/pdfjs/content/PdfStreamConverter.sys.mjs b/toolkit/components/pdfjs/content/PdfStreamConverter.sys.mjs
@@ -501,6 +501,11 @@ class ChromeActions {
actor?.sendAsyncMessage("PDFJS:Parent:reportTelemetry", data);
}
+ reportText(data) {
+ const actor = getActor(this.domWindow);
+ actor?.sendAsyncMessage("PDFJS:Parent:reportText", data);
+ }
+
updateFindControlState(data) {
if (!this.supportsIntegratedFind()) {
return;
diff --git a/toolkit/components/pdfjs/content/PdfjsParent.sys.mjs b/toolkit/components/pdfjs/content/PdfjsParent.sys.mjs
@@ -73,6 +73,15 @@ export class PdfjsParent extends JSWindowActorParent {
"enableNewAltTextWhenAddingImage",
]);
+ #nextTextRequestId = 0; // Id handed to the next getTextContent request.
+
+ /**
+ * Holds the Promise resolve functions for pending getTextContent requests,
+ * keyed by request id.
+ *
+ * @type {Map<number, (text: string) => void>}
+ */
+ #textRequests = new Map();
+
constructor() {
super();
this._boundToFindbar = null;
@@ -104,6 +113,8 @@ export class PdfjsParent extends JSWindowActorParent {
return this._addEventListener();
case "PDFJS:Parent:saveURL":
return this._saveURL(aMsg);
+ case "PDFJS:Parent:reportText":
+ return this._reportText(aMsg);
case "PDFJS:Parent:recordExposure":
return this._recordExposure();
case "PDFJS:Parent:reportTelemetry":
@@ -132,6 +143,22 @@ export class PdfjsParent extends JSWindowActorParent {
return this.browsingContext.top.embedderElement;
}
+ /**
+ * Extracts the text content from a PDF.
+ *
+ * @returns {Promise<string>}
+ */
+ getTextContent() {
+ const { promise, resolve } = Promise.withResolvers();
+ const requestId = this.#nextTextRequestId++;
+ this.#textRequests.set(requestId, resolve);
+ this.sendAsyncMessage("PDFJS:Child:handleEvent", {
+ type: "requestTextContent",
+ detail: { requestId },
+ });
+ return promise;
+ }
+
async #openDatabase() {
return lazy.IndexedDB.open(PDFJS_DB_NAME, PDFJS_DB_VERSION, db => {
db.createObjectStore(PDFJS_STORE_NAME, {
@@ -584,6 +611,25 @@ export class PdfjsParent extends JSWindowActorParent {
});
}
+ /**
+ * Handle the response for extracting text.
+ *
+ * @param {{ data: { text: string, requestId: number } }}
+ */
+ _reportText({ data }) {
+ const resolve = this.#textRequests.get(data.requestId);
+ this.#textRequests.delete(data.requestId);
+ if (!resolve) {
+ console.error(
+ "Unable to find the text content request",
+ data.requestId,
+ this.#textRequests
+ );
+ return;
+ }
+ resolve(data.text);
+ }
+
_updateMatchesCount(aMsg) {
let data = aMsg.data;
let browser = this.browser;