PageDataService.sys.mjs (14888B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

import { EventEmitter } from "resource://gre/modules/EventEmitter.sys.mjs";

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.sys.mjs",
  HiddenBrowserManager: "resource://gre/modules/HiddenFrame.sys.mjs",
});

ChromeUtils.defineLazyGetter(lazy, "logConsole", function () {
  return console.createInstance({
    prefix: "PageData",
    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
      ? "Debug"
      : "Warn",
  });
});

XPCOMUtils.defineLazyServiceGetters(lazy, {
  idleService: ["@mozilla.org/widget/useridleservice;1", Ci.nsIUserIdleService],
});

XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "fetchIdleTime",
  "browser.pagedata.fetchIdleTime",
  300
);

// Only pages whose url uses one of these protocols are considered by
// `PageDataService.pageLoaded`.
const ALLOWED_PROTOCOLS = new Set(["http:", "https:", "data:", "blob:"]);

/**
 * Shifts the first element out of the set.
 *
 * @param {Set<T>} set
 *   The set containing elements. It is mutated: the returned element is
 *   removed from it.
 * @returns {T | undefined} The first element in the set or undefined if
 *   there is nothing in the set.
 */
function shift(set) {
  let iter = set.values();
  let { value, done } = iter.next();

  if (done) {
    return undefined;
  }

  set.delete(value);
  return value;
}

/**
 * @typedef {object} CacheEntry
 *   An entry in the page data cache.
 * @property {PageData | null} pageData
 *   The data or null if there is no known data.
 * @property {Set} actors
 *   The actors that maintain an interest in keeping the entry cached.
 */

/**
 * A cache of page data kept in memory. By default any discovered data from
 * browsers is kept in memory until the browser element is destroyed but other
 * actors may register an interest in keeping an entry alive beyond that.
 */
class PageDataCache {
  /**
   * The contents of the cache. Keyed on page url.
   *
   * @type {Map<string, CacheEntry>}
   */
  #cache = new Map();

  /**
   * Updates an existing entry in the cache. Entries are only created by
   * `lockData`, so if no actor has registered any interest in keeping this
   * page's data in memory then this will do nothing.
   *
   * @param {string} url
   *   The url of the page.
   * @param {PageData|null} pageData
   *   The current page data for the page.
   */
  set(url, pageData) {
    let entry = this.#cache.get(url);

    if (entry) {
      entry.pageData = pageData;
    }
  }

  /**
   * Gets any cached data for the url.
   *
   * @param {string} url
   *   The url of the page.
   * @returns {PageData | null}
   *   The page data if some is known, otherwise null (both when there is no
   *   entry and when the entry has no data yet).
   */
  get(url) {
    let entry = this.#cache.get(url);
    return entry?.pageData ?? null;
  }

  /**
   * Adds a lock to an entry. This can be called before we have discovered the
   * data for the url.
   *
   * @param {object} actor
   *   Ensures the entry stays in memory until unlocked by this actor.
   * @param {string} url
   *   The url of the page.
   */
  lockData(actor, url) {
    let entry = this.#cache.get(url);
    if (entry) {
      entry.actors.add(actor);
    } else {
      this.#cache.set(url, {
        // No data is known yet; use null as the `CacheEntry` typedef specifies.
        pageData: null,
        actors: new Set([actor]),
      });
    }
  }

  /**
   * Removes a lock from an entry. An entry is dropped from the cache once its
   * last lock has been removed.
   *
   * @param {object} actor
   *   The lock to remove.
   * @param {string | undefined} [url]
   *   The url of the page or undefined to unlock all urls locked by this actor.
   */
  unlockData(actor, url) {
    let entries = [];
    if (url) {
      let entry = this.#cache.get(url);
      if (!entry) {
        return;
      }

      entries.push([url, entry]);
    } else {
      // Snapshot the map so that entries can be deleted while iterating.
      entries = [...this.#cache];
    }

    for (let [entryUrl, entry] of entries) {
      // Only entries this actor actually held a lock on may be dropped here.
      if (entry.actors.delete(actor)) {
        if (entry.actors.size == 0) {
          this.#cache.delete(entryUrl);
        }
      }
    }
  }
}

/**
 * @typedef {object} PageData
 *   A set of data discovered from a page. Other than the `data` property this
 *   is the schema at `browser/components/pagedata/schemas/general.schema.json`.
 * @property {string} url
 *   The page's url.
 * @property {number} date
 *   The epoch based timestamp for when the data was discovered.
 * @property {string} siteName
 *   The page's friendly site name.
 * @property {string} image
 *   The page's image.
 * @property {object} data
 *   The map of data found which may be empty if no data was found. The key in
 *   the map is from the `PageDataSchema.DATA_TYPE` enumeration. The values are
 *   in the format defined by the schemas at
 *   `browser/components/pagedata/schemas`.
 */

export const PageDataService = new (class PageDataService extends EventEmitter {
  /**
   * Caches page data discovered from browsers.
   *
   * @type {PageDataCache}
   */
  #pageDataCache = new PageDataCache();

  /**
   * The number of currently running background fetches.
   *
   * @type {number}
   */
  #backgroundFetches = 0;

  /**
   * The list of urls waiting to be loaded in the background.
   *
   * @type {Set<string>}
   */
  #backgroundQueue = new Set();

  /**
   * Tracks whether the user is currently idle.
   *
   * @type {boolean}
   */
  #userIsIdle = false;

  /**
   * A map of hidden browsers to a resolve function that should be passed the
   * actor that was created for the browser.
   *
   * @type {WeakMap<Browser, function(PageDataParent): void>}
   */
  #backgroundBrowsers = new WeakMap();

  /**
   * Tracks windows that have browsers with entries in the cache.
   *
   * @type {Map<Window, Set<Browser>>}
   */
  #trackedWindows = new Map();

  /**
   * Constructs the service.
   */
  constructor() {
    super();

    // Limits the number of background fetches that will run at once. Set to 0 to
    // effectively allow an infinite number. When the preference changes we try
    // to start more workers in case the limit was raised.
    XPCOMUtils.defineLazyPreferenceGetter(
      this,
      "MAX_BACKGROUND_FETCHES",
      "browser.pagedata.maxBackgroundFetches",
      5,
      () => this.#startBackgroundWorkers()
    );
  }

  /**
   * Initializes a new instance of the service, not called externally. Does
   * nothing unless the `browser.pagedata.enabled` preference is set.
   */
  init() {
    if (!Services.prefs.getBoolPref("browser.pagedata.enabled", false)) {
      return;
    }

    ChromeUtils.registerWindowActor("PageData", {
      parent: {
        esModuleURI: "resource:///actors/PageDataParent.sys.mjs",
      },
      child: {
        esModuleURI: "resource:///actors/PageDataChild.sys.mjs",
        events: {
          DOMContentLoaded: {},
          pageshow: {},
        },
      },
    });

    lazy.logConsole.debug("Service started");

    for (let win of lazy.BrowserWindowTracker.orderedWindows) {
      if (win.closed) {
        continue;
      }

      // Ask any existing tabs to report.
      for (let tab of win.gBrowser.tabs) {
        try {
          // A browser may have no browsing context (and presumably no current
          // window global) at this point. In that case there is nothing to
          // ask, so skip the tab instead of dereferencing `undefined`.
          let parent =
            tab.linkedBrowser.browsingContext?.currentWindowGlobal?.getActor(
              "PageData"
            );

          parent?.sendAsyncMessage("PageData:CheckLoaded");
        } catch (e) {
          // One misbehaving tab must not prevent the idle observer below from
          // being registered.
          lazy.logConsole.error(e);
        }
      }
    }

    lazy.idleService.addIdleObserver(this, lazy.fetchIdleTime);
  }

  /**
   * Called when the service is destroyed. This is generally on shutdown so we
   * don't really need to do much cleanup.
   */
  uninit() {
    lazy.logConsole.debug("Service stopped");
  }

  /**
   * Starts tracking for when a browser is destroyed so that any cache entries
   * it locked can be released.
   *
   * @param {Browser} browser
   *   The browser to track.
   */
  #trackBrowser(browser) {
    let window = browser.ownerGlobal;

    let browsers = this.#trackedWindows.get(window);
    if (browsers) {
      browsers.add(browser);

      // This window is already being tracked, no need to add listeners.
      return;
    }

    browsers = new Set([browser]);
    this.#trackedWindows.set(window, browsers);

    window.addEventListener("unload", () => {
      // The window is going away, release everything its browsers locked.
      for (let closedBrowser of browsers) {
        this.unlockEntry(closedBrowser);
      }

      this.#trackedWindows.delete(window);
    });

    window.addEventListener("TabClose", ({ target: tab }) => {
      // Unlock any entries locked by this browser.
      let closedBrowser = tab.linkedBrowser;
      this.unlockEntry(closedBrowser);
      browsers.delete(closedBrowser);
    });
  }

  /**
   * Requests that any page data for this url is retained in memory until
   * unlocked. By calling this you are committing to later call `unlockEntry`
   * with the same `actor` and `url` parameters.
   *
   * @param {object} actor
   *   The actor requesting the lock.
   * @param {string} url
   *   The url of the page to lock.
   */
  lockEntry(actor, url) {
    this.#pageDataCache.lockData(actor, url);
  }

  /**
   * Notifies that an actor is no longer interested in a url.
   *
   * @param {object} actor
   *   The actor that requested the lock.
   * @param {string | undefined} [url]
   *   The url of the page or undefined to unlock all urls locked by this actor.
   */
  unlockEntry(actor, url) {
    this.#pageDataCache.unlockData(actor, url);
  }

  /**
   * Called when the content process signals that a page is ready for data
   * collection.
   *
   * @param {PageDataParent} actor
   *   The parent actor for the page.
   * @param {string} url
   *   The url of the page.
   */
  async pageLoaded(actor, url) {
    // Ignore urls that cannot be parsed rather than rejecting with a TypeError
    // from the URL constructor.
    if (!URL.canParse(url) || !ALLOWED_PROTOCOLS.has(new URL(url).protocol)) {
      return;
    }

    let browser = actor.browsingContext?.embedderElement;

    // If we don't have a browser then it went away before we could record,
    // so we don't know where the data came from.
    if (!browser) {
      return;
    }

    // Is this a load in a background browser? If so hand the actor to the
    // pending `fetchPageData` call, which collects the data itself.
    let backgroundResolve = this.#backgroundBrowsers.get(browser);
    if (backgroundResolve) {
      backgroundResolve(actor);
      return;
    }

    // Otherwise we only care about pages loaded in the tab browser.
    if (!this.#isATabBrowser(browser)) {
      return;
    }

    try {
      let data = await actor.collectPageData();
      if (data) {
        // Keep this data alive until the browser is destroyed. The lock must
        // exist before `pageDataDiscovered` runs because the cache only stores
        // data for locked urls.
        this.#trackBrowser(browser);
        this.lockEntry(browser, data.url);

        this.pageDataDiscovered(data);
      }
    } catch (e) {
      lazy.logConsole.error(e);
    }
  }

  /**
   * Adds data for a url. This should generally only be called by other components of the
   * page data service or tests for simulating page data collection.
   *
   * The cache receives a copy with `data` defaulted to an empty object; the
   * `page-data` event is emitted with the object that was passed in.
   *
   * @param {PageData} pageData
   *   The set of data discovered.
   */
  pageDataDiscovered(pageData) {
    lazy.logConsole.debug("Discovered page data", pageData);

    this.#pageDataCache.set(pageData.url, {
      ...pageData,
      data: pageData.data ?? {},
    });

    // Send out a notification.
    this.emit("page-data", pageData);
  }

  /**
   * Retrieves any cached page data. Returns null if there is no information in the cache, this will
   * happen either if the page has not been browsed recently or if data collection failed for some
   * reason.
   *
   * @param {string} url
   *   The url to retrieve data for.
   * @returns {PageData|null}
   *   A `PageData` if one is cached (it may not actually contain any items of data) or null if this
   *   page has not been successfully checked for data recently.
   */
  getCached(url) {
    return this.#pageDataCache.get(url);
  }

  /**
   * Fetches page data from the given URL using a hidden window. Note that this does not populate
   * the page data cache or emit the `page-data` event.
   *
   * NOTE(review): the returned promise only settles once `pageLoaded` is called for the hidden
   * browser with a url that passes the protocol check; it looks like a load that never reaches
   * that point would leave this pending — confirm whether a timeout is needed.
   *
   * @param {string} url
   *   The url to retrieve data for.
   * @returns {Promise<PageData|null>}
   *   Resolves to whatever the page's actor collects (presumably null when collection finds
   *   nothing; `collectPageData` is defined elsewhere).
   */
  async fetchPageData(url) {
    return lazy.HiddenBrowserManager.withHiddenBrowser(async browser => {
      try {
        // `pageLoaded` looks this browser up and calls `resolve` with the
        // actor once the page reports that it is ready.
        let { promise, resolve } = Promise.withResolvers();
        this.#backgroundBrowsers.set(browser, resolve);

        let principal = Services.scriptSecurityManager.getSystemPrincipal();
        let loadURIOptions = {
          triggeringPrincipal: principal,
        };
        browser.fixupAndLoadURIString(url, loadURIOptions);

        let actor = await promise;
        return await actor.collectPageData();
      } finally {
        this.#backgroundBrowsers.delete(browser);
      }
    });
  }

  /**
   * Handles notifications from the idle service.
   *
   * @param {nsISupports} subject
   *   The notification's subject.
   * @param {string} topic
   *   The notification topic.
   */
  observe(subject, topic) {
    switch (topic) {
      case "idle":
        lazy.logConsole.debug("User went idle");
        this.#userIsIdle = true;
        this.#startBackgroundWorkers();
        break;
      case "active":
        lazy.logConsole.debug("User became active");
        // Running workers notice this flag after their current fetch and stop.
        this.#userIsIdle = false;
        break;
    }
  }

  /**
   * Starts as many background workers as are allowed to process the background
   * queue. Does nothing while the user is active.
   */
  #startBackgroundWorkers() {
    if (!this.#userIsIdle) {
      return;
    }

    // Never start more workers than there are queued urls; a worker that finds
    // the queue empty would exit immediately anyway.
    let toStart = this.#backgroundQueue.size;

    if (this.MAX_BACKGROUND_FETCHES) {
      toStart = Math.min(
        toStart,
        this.MAX_BACKGROUND_FETCHES - this.#backgroundFetches
      );
    }

    for (let i = 0; i < toStart; i++) {
      // Not awaited on purpose: each worker logs its own failures.
      this.#backgroundFetch();
    }
  }

  /**
   * Starts a background fetch worker which will pull urls from the queue and
   * load them until the queue is empty, the user becomes active or the worker
   * limit is exceeded.
   */
  async #backgroundFetch() {
    this.#backgroundFetches++;

    try {
      let url = shift(this.#backgroundQueue);
      while (url) {
        try {
          let pageData = await this.fetchPageData(url);

          if (pageData) {
            this.#pageDataCache.set(url, pageData);
            this.emit("page-data", pageData);
          }
        } catch (e) {
          lazy.logConsole.error(e);
        }

        // Check whether the user became active or the worker limit changed
        // dynamically.
        if (
          !this.#userIsIdle ||
          (this.MAX_BACKGROUND_FETCHES > 0 &&
            this.#backgroundFetches > this.MAX_BACKGROUND_FETCHES)
        ) {
          break;
        }

        url = shift(this.#backgroundQueue);
      }
    } finally {
      // Always release the worker slot so the running count cannot leak.
      this.#backgroundFetches--;
    }
  }

  /**
   * Queues page data retrieval for a url. The page-data notification will be
   * generated if data becomes available.
   *
   * Check `getCached` first to ensure that data is not already in the cache.
   *
   * @param {string} url
   *   The url to retrieve data for.
   */
  queueFetch(url) {
    this.#backgroundQueue.add(url);

    this.#startBackgroundWorkers();
  }

  /**
   * Determines if the given browser is contained within a tab.
   *
   * @param {DOMElement} browser
   *   The browser element to check.
   * @returns {boolean}
   *   True if the browser element is contained within a tab.
   */
  #isATabBrowser(browser) {
    // Coerce the tab lookup result so callers get the documented boolean.
    return !!browser.ownerGlobal.gBrowser?.getTabForBrowser(browser);
  }
})();