commit 2960b00b8ff9c65df18fa61ba23020d696894cdd
parent 38f389d8292b0481625dae12543e725f389b8aa7
Author: Greg Tatum <tatum.creative@gmail.com>
Date: Sat, 18 Oct 2025 02:16:52 +0000
Bug 1994183 - Support about:reader in the PageExtractor; r=nordzilla,Gijs,translations-reviewers
Differential Revision: https://phabricator.services.mozilla.com/D268902
Diffstat:
6 files changed, 183 insertions(+), 4 deletions(-)
diff --git a/browser/components/translations/tests/browser/head.js b/browser/components/translations/tests/browser/head.js
@@ -329,14 +329,22 @@ async function switchTab(tab, name) {
async function toggleReaderMode() {
logAction();
const readerButton = document.getElementById("reader-mode-button");
- await waitForCondition(() => readerButton.hidden === false);
+ await BrowserTestUtils.waitForMutationCondition(
+ readerButton,
+ { attributes: true, attributeFilter: ["hidden"] },
+ () => readerButton.hidden === false
+ );
readerButton.getAttribute("readeractive")
? info("Exiting reader mode")
: info("Entering reader mode");
const readyPromise = readerButton.getAttribute("readeractive")
- ? waitForCondition(() => !readerButton.getAttribute("readeractive"))
+ ? BrowserTestUtils.waitForMutationCondition(
+ readerButton,
+ { attributes: true, attributeFilter: ["readeractive"] },
+ () => !readerButton.getAttribute("readeractive")
+ )
: BrowserTestUtils.waitForContentEvent(
gBrowser.selectedBrowser,
"AboutReaderContentReady"
diff --git a/toolkit/components/pageextractor/PageExtractorChild.sys.mjs b/toolkit/components/pageextractor/PageExtractorChild.sys.mjs
@@ -52,8 +52,14 @@ export class PageExtractorChild extends JSWindowActorChild {
async receiveMessage({ name, data }) {
switch (name) {
case "PageExtractorParent:GetReaderModeContent":
+ if (this.isAboutReader()) {
+ return this.getAboutReaderContent();
+ }
return this.getReaderModeContent(data);
case "PageExtractorParent:GetText":
+ if (this.isAboutReader()) {
+ return this.getAboutReaderContent();
+ }
return this.getText(data);
}
return Promise.reject(new Error("Unknown message: " + name));
@@ -82,11 +88,14 @@ export class PageExtractorChild extends JSWindowActorChild {
return "";
}
- const text = (article?.textContent || "")
+ let text = (article?.textContent || "")
.trim()
// Replace duplicate whitespace with either a single newline or space
.replace(/(\s*\n\s*)|\s{2,}/g, (_, newline) => (newline ? "\n" : " "));
+ if (article.title) {
+ text = article.title + "\n\n" + text;
+ }
lazy.console.log("GetReaderModeContent", { force });
lazy.console.debug(text);
@@ -122,4 +131,48 @@ export class PageExtractorChild extends JSWindowActorChild {
return text.trim();
}
+
+ /**
+ * Special case extracting text from Reader Mode. The original article content is not
+ * retained once reader mode is activated. It is rendered out to the page. Rather
+ * than cache an additional copy of the article, just extract the text from the
+ * actual reader mode DOM.
+ *
+ * @returns {string | null}
+ */
+ getAboutReaderContent() {
+ lazy.console.log("Using special text extraction strategy for about:reader");
+ const document = this.manager.contentWindow.document;
+
+ if (!document) {
+ return null;
+ }
+ /** @type {HTMLElement?} */
+ const titleEl = document.querySelector(".reader-title");
+ /** @type {HTMLElement?} */
+ const contentEl = document.querySelector(".moz-reader-content");
+
+ const title = titleEl?.innerText;
+ const content = contentEl?.innerText;
+ if (!title && !content) {
+ return null;
+ }
+
+ if (title) {
+ return `${title}\n\n${content}`.trim();
+ }
+ return content.trim();
+ }
+
+ /**
+ * Checks if about:reader is loaded, which requires special handling.
+ *
+ * @returns {boolean}
+ */
+ isAboutReader() {
+ // Accessing the documentURIObject in this way does not materialize the
+ // `window.location.href` and should be a cheaper check here.
+ let url = this.manager.contentWindow.document.documentURIObject;
+ return url.schemeIs("about") && url.pathQueryRef.startsWith("reader?");
+ }
}
diff --git a/toolkit/components/pageextractor/tests/browser/browser.toml b/toolkit/components/pageextractor/tests/browser/browser.toml
@@ -7,3 +7,5 @@ support-files = [
]
["browser_dom_extractor.js"]
+
+["browser_dom_extractor_reader_mode.js"]
diff --git a/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_reader_mode.js b/toolkit/components/pageextractor/tests/browser/browser_dom_extractor_reader_mode.js
@@ -0,0 +1,62 @@
+/* Any copyright is dedicated to the Public Domain.
+ https://creativecommons.org/publicdomain/zero/1.0/ */
+
+"use strict";
+
+/**
+ * There is some inconsistency in newline handling between the modes. Make all newlines
+ * collapse to just spaces.
+ *
+ * @param {string} text
+ */
+function normalizeWhitespace(text) {
+ return text.replaceAll("\n\n", "\n").replaceAll("\n", " ");
+}
+
+add_task(async function test_dom_extractor_reader_mode() {
+ const title = "Etymology of Mochitests";
+ const article =
+ `It's interesting that inside of Mozilla most people call mochitests "moh` +
+ `kee tests". I believe this is because it is adjacent to the term` +
+ `"mocha tests", which is pronounced with the hard k sound. However, the` +
+ `testing infrastructure is named after the delicious Japanese treat known` +
+ `as mochi. Mochi, pronounced like "moh chee" is a food that is made from` +
+ `pounding steamed rice into a soft elastic mass.`;
+
+ const { cleanup, getPageExtractor } = await html`
+ <article>
+ <h1>${title}</h1>
+ <p>${article}</p>
+ </article>
+ `;
+
+ const text = `${title} ${article}`;
+
+ is(
+ normalizeWhitespace(await getPageExtractor().getText()),
+ text,
+ "Normal page content supports getText"
+ );
+
+ is(
+ normalizeWhitespace(await getPageExtractor().getReaderModeContent()),
+ text,
+ "Normal page content supports getReaderModeContent"
+ );
+
+ await toggleReaderMode();
+
+ is(
+ normalizeWhitespace(await getPageExtractor().getText()),
+ text,
+ "about:reader is supported with getText"
+ );
+
+ is(
+ normalizeWhitespace(await getPageExtractor().getReaderModeContent()),
+ text,
+ "about:reader is supported with getReaderModeContent"
+ );
+
+ await cleanup();
+});
diff --git a/toolkit/components/pageextractor/tests/browser/head.js b/toolkit/components/pageextractor/tests/browser/head.js
@@ -29,14 +29,28 @@ async function html(strings, ...values) {
true // waitForLoad
);
- /** @type {PageExtractorParent} */
const actor =
tab.linkedBrowser.browsingContext.currentWindowGlobal.getActor(
"PageExtractor"
);
return {
+ /**
+ * @type {PageExtractorParent}
+ */
actor,
+
+ /**
+ * Get a new page extractor, which can change when navigating pages.
+ *
+ * @returns {PageExtractorParent}
+ */
+ getPageExtractor() {
+ return tab.linkedBrowser.browsingContext.currentWindowGlobal.getActor(
+ "PageExtractor"
+ );
+ },
+
async cleanup() {
info("Cleaning up");
await serverClosed;
@@ -76,3 +90,42 @@ function serveOnce(html) {
return { url, serverClosed: promise };
}
+
+/**
+ * Click the reader-mode button if the reader-mode button is available.
+ * Fails if the reader-mode button is hidden.
+ */
+async function toggleReaderMode() {
+ const readerButton = document.getElementById("reader-mode-button");
+ await BrowserTestUtils.waitForMutationCondition(
+ readerButton,
+ { attributes: true, attributeFilter: ["hidden"] },
+ () => readerButton.hidden === false
+ );
+
+ readerButton.getAttribute("readeractive")
+ ? info("Exiting reader mode")
+ : info("Entering reader mode");
+
+ const readyPromise = readerButton.getAttribute("readeractive")
+ ? BrowserTestUtils.waitForMutationCondition(
+ readerButton,
+ { attributes: true, attributeFilter: ["readeractive"] },
+ () => !readerButton.getAttribute("readeractive")
+ )
+ : BrowserTestUtils.waitForContentEvent(
+ gBrowser.selectedBrowser,
+ "AboutReaderContentReady"
+ );
+
+ click(readerButton, "Clicking the reader-mode button");
+ await readyPromise;
+}
+/** Logs the message and then clicks the button, throwing if the button is hidden. */
+function click(button, message) {
+ info(message);
+ if (button.hidden) {
+ throw new Error("The button was hidden when trying to click it.");
+ }
+ button.click();
+}
diff --git a/toolkit/modules/ActorManagerParent.sys.mjs b/toolkit/modules/ActorManagerParent.sys.mjs
@@ -482,6 +482,7 @@ let JSWINDOWACTORS = {
"file:///*",
"moz-extension://*",
"data:text/html,*",
+ "about:reader?*",
],
messageManagerGroups: ["browsers"],
},