tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

PageDataService.sys.mjs (14888B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
      6 
      7 import { EventEmitter } from "resource://gre/modules/EventEmitter.sys.mjs";
      8 
      9 const lazy = {};
     10 
     11 ChromeUtils.defineESModuleGetters(lazy, {
     12  BrowserWindowTracker: "resource:///modules/BrowserWindowTracker.sys.mjs",
     13  HiddenBrowserManager: "resource://gre/modules/HiddenFrame.sys.mjs",
     14 });
     15 
     16 ChromeUtils.defineLazyGetter(lazy, "logConsole", function () {
     17  return console.createInstance({
     18    prefix: "PageData",
     19    maxLogLevel: Services.prefs.getBoolPref("browser.pagedata.log", false)
     20      ? "Debug"
     21      : "Warn",
     22  });
     23 });
     24 
     25 XPCOMUtils.defineLazyServiceGetters(lazy, {
     26  idleService: ["@mozilla.org/widget/useridleservice;1", Ci.nsIUserIdleService],
     27 });
     28 
     29 XPCOMUtils.defineLazyPreferenceGetter(
     30  lazy,
     31  "fetchIdleTime",
     32  "browser.pagedata.fetchIdleTime",
     33  300
     34 );
     35 
     36 const ALLOWED_PROTOCOLS = new Set(["http:", "https:", "data:", "blob:"]);
     37 
     38 /**
     39 * Shifts the first element out of the set.
     40 *
     41 * @param {Set<T>} set
     42 *   The set containing elements.
     43 * @returns {T | undefined} The first element in the set or undefined if
     44 *   there is nothing in the set.
     45 */
     46 function shift(set) {
     47  let iter = set.values();
     48  let { value, done } = iter.next();
     49 
     50  if (done) {
     51    return undefined;
     52  }
     53 
     54  set.delete(value);
     55  return value;
     56 }
     57 
     58 /**
     59 * @typedef {object} CacheEntry
     60 *   An entry in the page data cache.
     61 * @property {PageData | null} pageData
     62 *   The data or null if there is no known data.
     63 * @property {Set} actors
     64 *   The actors that maintain an interest in keeping the entry cached.
     65 */
     66 
     67 /**
     68 * A cache of page data kept in memory. By default any discovered data from
     69 * browsers is kept in memory until the browser element is destroyed but other
     70 * actors may register an interest in keeping an entry alive beyond that.
     71 */
     72 class PageDataCache {
     73  /**
     74   * The contents of the cache. Keyed on page url.
     75   *
     76   * @type {Map<string, CacheEntry>}
     77   */
     78  #cache = new Map();
     79 
     80  /**
     81   * Creates or updates an entry in the cache. If no actor has registered any
     82   * interest in keeping this page's data in memory then this will do nothing.
     83   *
     84   * @param {string} url
     85   *   The url of the page.
     86   * @param {PageData|null} pageData
     87   *   The current page data for the page.
     88   */
     89  set(url, pageData) {
     90    let entry = this.#cache.get(url);
     91 
     92    if (entry) {
     93      entry.pageData = pageData;
     94    }
     95  }
     96 
     97  /**
     98   * Gets any cached data for the url.
     99   *
    100   * @param {string} url
    101   *   The url of the page.
    102   * @returns {PageData | null}
    103   *   The page data if some is known.
    104   */
    105  get(url) {
    106    let entry = this.#cache.get(url);
    107    return entry?.pageData ?? null;
    108  }
    109 
    110  /**
    111   * Adds a lock to an entry. This can be called before we have discovered the
    112   * data for the url.
    113   *
    114   * @param {object} actor
    115   *   Ensures the entry stays in memory until unlocked by this actor.
    116   * @param {string} url
    117   *   The url of the page.
    118   */
    119  lockData(actor, url) {
    120    let entry = this.#cache.get(url);
    121    if (entry) {
    122      entry.actors.add(actor);
    123    } else {
    124      this.#cache.set(url, {
    125        pageData: undefined,
    126        actors: new Set([actor]),
    127      });
    128    }
    129  }
    130 
    131  /**
    132   * Removes a lock from an entry.
    133   *
    134   * @param {object} actor
    135   *   The lock to remove.
    136   * @param {string | undefined} [url]
    137   *   The url of the page or undefined to unlock all urls locked by this actor.
    138   */
    139  unlockData(actor, url) {
    140    let entries = [];
    141    if (url) {
    142      let entry = this.#cache.get(url);
    143      if (!entry) {
    144        return;
    145      }
    146 
    147      entries.push([url, entry]);
    148    } else {
    149      entries = [...this.#cache];
    150    }
    151 
    152    for (let [entryUrl, entry] of entries) {
    153      if (entry.actors.delete(actor)) {
    154        if (entry.actors.size == 0) {
    155          this.#cache.delete(entryUrl);
    156        }
    157      }
    158    }
    159  }
    160 }
    161 
    162 /**
    163 * @typedef {object} PageData
    164 *   A set of data discovered from a page. Other than the `data` property this is the
    165 *   schema at `browser/components/pagedata/schemas/general.schema.json`.
    166 * @property {string} url
    167 *   The page's url.
    168 * @property {number} date
    169 *   The epoch based timestamp for when the data was discovered.
    170 * @property {string} siteName
    171 *   The page's friendly site name.
    172 * @property {string} image
    173 *   The page's image.
    174 * @property {object} data
    175 *   The map of data found which may be empty if no data was found. The key in
    176 *   map is from the `PageDataSchema.DATA_TYPE` enumeration. The values are in
    177 *   the format defined by the schemas at `browser/components/pagedata/schemas`.
    178 */
    179 
    180 export const PageDataService = new (class PageDataService extends EventEmitter {
    181  /**
    182   * Caches page data discovered from browsers.
    183   *
    184   * @type {PageDataCache}
    185   */
    186  #pageDataCache = new PageDataCache();
    187 
    188  /**
    189   * The number of currently running background fetches.
    190   *
    191   * @type {number}
    192   */
    193  #backgroundFetches = 0;
    194 
    195  /**
    196   * The list of urls waiting to be loaded in the background.
    197   *
    198   * @type {Set<string>}
    199   */
    200  #backgroundQueue = new Set();
    201 
    202  /**
    203   * Tracks whether the user is currently idle.
    204   *
    205   * @type {boolean}
    206   */
    207  #userIsIdle = false;
    208 
    209  /**
    210   * A map of hidden browsers to a resolve function that should be passed the
    211   * actor that was created for the browser.
    212   *
    213   * @type {WeakMap<Browser, function(PageDataParent): void>}
    214   */
    215  #backgroundBrowsers = new WeakMap();
    216 
    217  /**
    218   * Tracks windows that have browsers with entries in the cache.
    219   *
    220   * @type {Map<Window, Set<Browser>>}
    221   */
    222  #trackedWindows = new Map();
    223 
    224  /**
    225   * Constructs the service.
    226   */
    227  constructor() {
    228    super();
    229 
    230    // Limits the number of background fetches that will run at once. Set to 0 to
    231    // effectively allow an infinite number.
    232    XPCOMUtils.defineLazyPreferenceGetter(
    233      this,
    234      "MAX_BACKGROUND_FETCHES",
    235      "browser.pagedata.maxBackgroundFetches",
    236      5,
    237      () => this.#startBackgroundWorkers()
    238    );
    239  }
    240 
    241  /**
    242   * Initializes a new instance of the service, not called externally.
    243   */
    244  init() {
    245    if (!Services.prefs.getBoolPref("browser.pagedata.enabled", false)) {
    246      return;
    247    }
    248 
    249    ChromeUtils.registerWindowActor("PageData", {
    250      parent: {
    251        esModuleURI: "resource:///actors/PageDataParent.sys.mjs",
    252      },
    253      child: {
    254        esModuleURI: "resource:///actors/PageDataChild.sys.mjs",
    255        events: {
    256          DOMContentLoaded: {},
    257          pageshow: {},
    258        },
    259      },
    260    });
    261 
    262    lazy.logConsole.debug("Service started");
    263 
    264    for (let win of lazy.BrowserWindowTracker.orderedWindows) {
    265      if (!win.closed) {
    266        // Ask any existing tabs to report
    267        for (let tab of win.gBrowser.tabs) {
    268          let parent =
    269            tab.linkedBrowser.browsingContext?.currentWindowGlobal.getActor(
    270              "PageData"
    271            );
    272 
    273          parent.sendAsyncMessage("PageData:CheckLoaded");
    274        }
    275      }
    276    }
    277 
    278    lazy.idleService.addIdleObserver(this, lazy.fetchIdleTime);
    279  }
    280 
    281  /**
    282   * Called when the service is destroyed. This is generally on shutdown so we
    283   * don't really need to do much cleanup.
    284   */
    285  uninit() {
    286    lazy.logConsole.debug("Service stopped");
    287  }
    288 
    289  /**
    290   * Starts tracking for when a browser is destroyed.
    291   *
    292   * @param {Browser} browser
    293   *   The browser to track.
    294   */
    295  #trackBrowser(browser) {
    296    let window = browser.ownerGlobal;
    297 
    298    let browsers = this.#trackedWindows.get(window);
    299    if (browsers) {
    300      browsers.add(browser);
    301 
    302      // This window is already being tracked, no need to add listeners.
    303      return;
    304    }
    305 
    306    browsers = new Set([browser]);
    307    this.#trackedWindows.set(window, browsers);
    308 
    309    window.addEventListener("unload", () => {
    310      for (let closedBrowser of browsers) {
    311        this.unlockEntry(closedBrowser);
    312      }
    313 
    314      this.#trackedWindows.delete(window);
    315    });
    316 
    317    window.addEventListener("TabClose", ({ target: tab }) => {
    318      // Unlock any entries locked by this browser.
    319      let closedBrowser = tab.linkedBrowser;
    320      this.unlockEntry(closedBrowser);
    321      browsers.delete(closedBrowser);
    322    });
    323  }
    324 
    325  /**
    326   * Requests that any page data for this url is retained in memory until
    327   * unlocked. By calling this you are committing to later call `unlockEntry`
    328   * with the same `actor` and `url` parameters.
    329   *
    330   * @param {object} actor
    331   *   The actor requesting the lock.
    332   * @param {string} url
    333   *   The url of the page to lock.
    334   */
    335  lockEntry(actor, url) {
    336    this.#pageDataCache.lockData(actor, url);
    337  }
    338 
    339  /**
    340   * Notifies that an actor is no longer interested in a url.
    341   *
    342   * @param {object} actor
    343   *   The actor that requested the lock.
    344   * @param {string | undefined} [url]
    345   *   The url of the page or undefined to unlock all urls locked by this actor.
    346   */
    347  unlockEntry(actor, url) {
    348    this.#pageDataCache.unlockData(actor, url);
    349  }
    350 
    351  /**
    352   * Called when the content process signals that a page is ready for data
    353   * collection.
    354   *
    355   * @param {PageDataParent} actor
    356   *   The parent actor for the page.
    357   * @param {string} url
    358   *   The url of the page.
    359   */
    360  async pageLoaded(actor, url) {
    361    if (!ALLOWED_PROTOCOLS.has(new URL(url).protocol)) {
    362      return;
    363    }
    364 
    365    let browser = actor.browsingContext?.embedderElement;
    366 
    367    // If we don't have a browser then it went away before we could record,
    368    // so we don't know where the data came from.
    369    if (!browser) {
    370      return;
    371    }
    372 
    373    // Is this a load in a background browser?
    374    let backgroundResolve = this.#backgroundBrowsers.get(browser);
    375    if (backgroundResolve) {
    376      backgroundResolve(actor);
    377      return;
    378    }
    379 
    380    // Otherwise we only care about pages loaded in the tab browser.
    381    if (!this.#isATabBrowser(browser)) {
    382      return;
    383    }
    384 
    385    try {
    386      let data = await actor.collectPageData();
    387      if (data) {
    388        // Keep this data alive until the browser is destroyed.
    389        this.#trackBrowser(browser);
    390        this.lockEntry(browser, data.url);
    391 
    392        this.pageDataDiscovered(data);
    393      }
    394    } catch (e) {
    395      lazy.logConsole.error(e);
    396    }
    397  }
    398 
    399  /**
    400   * Adds data for a url. This should generally only be called by other components of the
    401   * page data service or tests for simulating page data collection.
    402   *
    403   * @param {PageData} pageData
    404   *   The set of data discovered.
    405   */
    406  pageDataDiscovered(pageData) {
    407    lazy.logConsole.debug("Discovered page data", pageData);
    408 
    409    this.#pageDataCache.set(pageData.url, {
    410      ...pageData,
    411      data: pageData.data ?? {},
    412    });
    413 
    414    // Send out a notification.
    415    this.emit("page-data", pageData);
    416  }
    417 
    418  /**
    419   * Retrieves any cached page data. Returns null if there is no information in the cache, this will
    420   * happen either if the page has not been browsed recently or if data collection failed for some
    421   * reason.
    422   *
    423   * @param {string} url
    424   *   The url to retrieve data for.
    425   * @returns {PageData|null}
    426   *   A `PageData` if one is cached (it may not actually contain any items of data) or null if this
    427   *   page has not been successfully checked for data recently.
    428   */
    429  getCached(url) {
    430    return this.#pageDataCache.get(url);
    431  }
    432 
    433  /**
    434   * Fetches page data from the given URL using a hidden window. Note that this does not populate
    435   * the page data cache or emit the `page-data` event.
    436   *
    437   * @param {string} url
    438   *   The url to retrieve data for.
    439   * @returns {Promise<PageData|null>}
    440   *   Resolves to the found pagedata or null in case of error.
    441   */
    442  async fetchPageData(url) {
    443    return lazy.HiddenBrowserManager.withHiddenBrowser(async browser => {
    444      try {
    445        let { promise, resolve } = Promise.withResolvers();
    446        this.#backgroundBrowsers.set(browser, resolve);
    447 
    448        let principal = Services.scriptSecurityManager.getSystemPrincipal();
    449        let loadURIOptions = {
    450          triggeringPrincipal: principal,
    451        };
    452        browser.fixupAndLoadURIString(url, loadURIOptions);
    453 
    454        let actor = await promise;
    455        return await actor.collectPageData();
    456      } finally {
    457        this.#backgroundBrowsers.delete(browser);
    458      }
    459    });
    460  }
    461 
    462  /**
    463   * Handles notifications from the idle service.
    464   *
    465   * @param {nsISupports} subject
    466   *   The notification's subject.
    467   * @param {string} topic
    468   *   The notification topic.
    469   */
    470  observe(subject, topic) {
    471    switch (topic) {
    472      case "idle":
    473        lazy.logConsole.debug("User went idle");
    474        this.#userIsIdle = true;
    475        this.#startBackgroundWorkers();
    476        break;
    477      case "active":
    478        lazy.logConsole.debug("User became active");
    479        this.#userIsIdle = false;
    480        break;
    481    }
    482  }
    483 
    484  /**
    485   * Starts as many background workers as are allowed to process the background
    486   * queue.
    487   */
    488  #startBackgroundWorkers() {
    489    if (!this.#userIsIdle) {
    490      return;
    491    }
    492 
    493    let toStart;
    494 
    495    if (this.MAX_BACKGROUND_FETCHES) {
    496      toStart = this.MAX_BACKGROUND_FETCHES - this.#backgroundFetches;
    497    } else {
    498      toStart = this.#backgroundQueue.size;
    499    }
    500 
    501    for (let i = 0; i < toStart; i++) {
    502      this.#backgroundFetch();
    503    }
    504  }
    505 
    506  /**
    507   * Starts a background fetch worker which will pull urls from the queue and
    508   * load them until the queue is empty.
    509   */
    510  async #backgroundFetch() {
    511    this.#backgroundFetches++;
    512 
    513    let url = shift(this.#backgroundQueue);
    514    while (url) {
    515      try {
    516        let pageData = await this.fetchPageData(url);
    517 
    518        if (pageData) {
    519          this.#pageDataCache.set(url, pageData);
    520          this.emit("page-data", pageData);
    521        }
    522      } catch (e) {
    523        lazy.logConsole.error(e);
    524      }
    525 
    526      // Check whether the user became active or the worker limit changed
    527      // dynamically.
    528      if (
    529        !this.#userIsIdle ||
    530        (this.MAX_BACKGROUND_FETCHES > 0 &&
    531          this.#backgroundFetches > this.MAX_BACKGROUND_FETCHES)
    532      ) {
    533        break;
    534      }
    535 
    536      url = shift(this.#backgroundQueue);
    537    }
    538 
    539    this.#backgroundFetches--;
    540  }
    541 
    542  /**
    543   * Queues page data retrieval for a url. The page-data notification will be
    544   * generated if data becomes available.
    545   *
    546   * Check `getCached` first to ensure that data is not already in the cache.
    547   *
    548   * @param {string} url
    549   *   The url to retrieve data for.
    550   */
    551  queueFetch(url) {
    552    this.#backgroundQueue.add(url);
    553 
    554    this.#startBackgroundWorkers();
    555  }
    556 
    557  /**
    558   * Determines if the given browser is contained within a tab.
    559   *
    560   * @param {DOMElement} browser
    561   *   The browser element to check.
    562   * @returns {boolean}
    563   *   True if the browser element is contained within a tab.
    564   */
    565  #isATabBrowser(browser) {
    566    return browser.ownerGlobal.gBrowser?.getTabForBrowser(browser);
    567  }
    568 })();