LinkPreviewModel.sys.mjs (18455B)
/**
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

// On average, each token represents about 4 characters. A factor of 3.5 is used
// instead of 4 to account for edge cases.
const CHARACTERS_PER_TOKEN = 3.5;
// On average, one token corresponds to approximately 4 characters, meaning 0.25
// times the character count would suffice under normal conditions. To ensure
// robustness and handle edge cases, we use a more conservative factor of 0.69.
const CONTEXT_SIZE_MULTIPLIER = 0.69;
const DEFAULT_INPUT_SENTENCES = 6;
const MIN_SENTENCE_LENGTH = 14;
const MIN_WORD_COUNT = 5;
const DEFAULT_INPUT_PROMPT =
  "You're an AI assistant for text re-writing and summarization. Rewrite the input text focusing on the main key point in at most three very short sentences.";

// All tokens taken from the model's vocabulary at https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct/raw/main/vocab.json
// Token id for end of text
const END_OF_TEXT_TOKEN = 0;
// Token id for beginning of sequence
const BOS_TOKEN = 1;
// Token id for end of sequence
const EOS_TOKEN = 2;

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  createEngine: "chrome://global/content/ml/EngineProcess.sys.mjs",
  Progress: "chrome://global/content/ml/Utils.sys.mjs",
  BlockListManager: "chrome://global/content/ml/Utils.sys.mjs",
  RemoteSettingsManager: "chrome://global/content/ml/Utils.sys.mjs",
});

/**
 * Parse a JSON-array token list from a pref value.
 *
 * The transform of a lazy preference getter runs every time the pref is read
 * after a change, so a user-set pref containing malformed JSON must not
 * throw — that would propagate out of the getter and break text generation.
 * Mirrors the try/catch-and-fallback convention used for the remote settings
 * request options in generateTextAI.
 *
 * @param {?string} rawValue - Raw pref string, possibly null/empty.
 * @returns {Array<number>} parsed token ids, or [] when missing or invalid.
 */
function parseTokenListPref(rawValue) {
  if (!rawValue) {
    return [];
  }
  try {
    return JSON.parse(rawValue);
  } catch (error) {
    console.error(
      "Error parsing link preview token list pref, will use an empty list.",
      error
    );
    return [];
  }
}

XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "config",
  "browser.ml.linkPreview.config",
  "{}"
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "inputSentences",
  "browser.ml.linkPreview.inputSentences"
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "outputSentences",
  "browser.ml.linkPreview.outputSentences"
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "prompt",
  "browser.ml.linkPreview.prompt"
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "blockListEnabled",
  "browser.ml.linkPreview.blockListEnabled"
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "preUserPrompt",
  "browser.ml.linkPreview.preUserPrompt",
  ""
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "postUserPrompt",
  "browser.ml.linkPreview.postUserPrompt",
  ""
);

XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "penalizedTokens",
  "browser.ml.linkPreview.penalizedTokens",
  // default (when PREF_INVALID)
  // Tokens with newlines for the default link preview model, based on the model's vocab: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct/raw/main/vocab.json
  JSON.stringify([
    198, 448, 466, 472, 629, 945, 1004, 1047, 1116, 1410, 1927, 2367, 2738,
    2830, 2953, 3136, 3299, 3337, 3354, 3558, 3717, 3805, 3914, 4602, 4767,
    5952, 7116, 7209, 7338, 7396, 8301, 8500, 8821, 8866, 9198, 9225, 9343,
    9694, 10459, 11181, 11259, 11539, 11813, 12350, 13002, 13272, 13280, 13596,
    13617, 13809, 14436, 14446, 15111, 15182, 15290, 15537, 16140, 16299, 16390,
    16506, 16871, 16980, 16997, 18682, 18850, 18864, 19014, 19145, 19993, 20098,
    20370, 20793, 21193, 21377, 21941, 22342, 22369, 23004, 23386, 23499, 23799,
    24112, 24205, 25457, 25576, 26675, 26886, 26925, 27536, 27924, 28577, 29306,
    29866, 30314, 30544, 30799, 31464, 32057, 32315, 32829, 34344, 34356, 35163,
    35988, 36176, 36286, 36328, 36489, 36496, 36804, 37468, 38028, 38031, 39014,
    39843, 39892, 40677, 40944, 42057, 42617, 43784, 43902, 44064, 46778, 47213,
    47647, 48259, 48279, 48818,
  ]),
  null, // no onUpdate callback
  parseTokenListPref
);
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "minWordsPerOutputSentences",
  "browser.ml.linkPreview.minWordsPerOutputSentences",
  0
);

// End of generation tokens.
XPCOMUtils.defineLazyPreferenceGetter(
  lazy,
  "stopTokens",
  "browser.ml.linkPreview.stopTokens",
  // default (when PREF_INVALID)
  JSON.stringify([END_OF_TEXT_TOKEN, BOS_TOKEN, EOS_TOKEN]),
  null, // no onUpdate callback
  rawValue => {
    if (!rawValue) {
      return [];
    }
    return JSON.parse(rawValue);
  }
);

/**
 * Generates short AI summaries of page text for link previews, using a small
 * local llama-backend model driven through the ML engine process.
 */
export const LinkPreviewModel = {
  /**
   * Manager for the block list. If null, no block list is applied.
   * Cached here (module singleton) by SentencePostProcessor.initialize so the
   * remote-settings block list is only fetched once per session.
   *
   * @type {BlockListManager}
   */
  blockListManager: null,

  /**
   * Blocked token list, i.e. token ids that will be penalized to -Infinity
   * during generation (see the logit_bias_* options in generateTextAI).
   *
   * @returns {Array<number>} block token list
   */
  getBlockTokenList() {
    return lazy.penalizedTokens;
  },
  /**
   * Extracts sentences from a given text.
   *
   * Periods inside known abbreviations are temporarily swapped for a
   * placeholder so Intl.Segmenter does not treat them as sentence ends, then
   * restored in the returned sentences.
   *
   * @param {string} text text to process
   * @returns {Array<string>} sentences
   */
  getSentences(text) {
    const abbreviations = [
      "Mr.",
      "Mrs.",
      "Ms.",
      "Dr.",
      "Prof.",
      "Inc.",
      "Ltd.",
      "Jr.",
      "Sr.",
      "St.",
      "e.g.",
      "i.e.",
      "U.S.A",
      "D.C.",
      "U.K.",
      "etc.",
      "a.m.",
      "p.m.",
      "D.",
      "Mass.",
      "Sen.",
      "Rep.",
      "No.",
      "Fig.",
      "vs.",
      "Mx.",
      "Ph.D.",
      "M.D.",
      "D.D.S.",
      "B.A.",
      "M.A.",
      "LL.B.",
      "LL.M.",
      "J.D.",
      "D.O.",
      "D.V.M.",
      "Psy.D.",
      "Ed.D.",
      "Eng.",
      "Co.",
      "Corp.",
      "Mt.",
      "Ft.",
      "U.S.",
      "U.S.A.",
      "E.U.",
      "et al.",
      "Nos.",
      "pp.",
      "Vol.",
      "Rev.",
      "Gen.",
      "Lt.",
      "Col.",
      "Maj.",
      "Capt.",
      "Sgt.",
      "Cpl.",
      "Pvt.",
      "Adm.",
      "Cmdr.",
      "Ave.",
      "Blvd.",
      "Rd.",
      "Ln.",
      "Jan.",
      "Feb.",
      "Mar.",
      "Apr.",
      "May.",
      "Jun.",
      "Jul.",
      "Aug.",
      "Sep.",
      "Sept.",
      "Oct.",
      "Nov.",
      "Dec.",
      "Mon.",
      "Tue.",
      "Tues.",
      "Wed.",
      "Thu.",
      "Thur.",
      "Thurs.",
      "Fri.",
      "Sat.",
      "Sun.",
      "Dept.",
      "Univ.",
      "Est.",
      "Calif.",
      "Fla.",
      "N.Y.",
      "Conn.",
      "Va.",
      "Ill.",
      "Assoc.",
      "Bros.",
      "Dist.",
      "Msgr.",
      "S.P.",
      "P.S.",
      "U.S.S.R.",
      "Mlle.",
      "Mme.",
      "Hon.",
      "Messrs.",
      "Mmes.",
      "v.",
      // NOTE(review): "vs." appears earlier in this list too; the duplicate
      // is harmless (the second pass finds nothing left to replace) but
      // redundant.
      "vs.",
    ];

    // Replace periods in abbreviations with a placeholder.
    let modifiedText = text;
    const placeholder = "∯";

    abbreviations.forEach(abbrev => {
      const escapedAbbrev = abbrev
        .replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
        // NOTE(review): this second replace is a no-op — the first replace
        // already leaves every period escaped as "\.".
        .replace(/\\\./g, "\\.");
      const regex = new RegExp(escapedAbbrev, "g");
      const abbrevWithPlaceholder = abbrev.replace(/\./g, placeholder);
      modifiedText = modifiedText.replace(regex, abbrevWithPlaceholder);
    });

    const segmenter = new Intl.Segmenter("en", {
      granularity: "sentence",
    });
    const segments = segmenter.segment(modifiedText);
    let sentences = Array.from(segments, segment => segment.segment);

    // Restore the periods in abbreviations.
    return sentences.map(sentence =>
      sentence.replace(new RegExp(placeholder, "g"), ".")
    );
  },

  /**
   * Clean up text for text generation AI.
   *
   * Strips emoji, normalizes whitespace per sentence, drops sentences that
   * are too short / too few words / not ending in punctuation, then keeps at
   * most maxNumSentences sentences joined by single spaces.
   *
   * @param {string} text to process
   * @param {number} maxNumSentences - Max number of sentences to return.
   * @returns {string} cleaned up text
   */
  preprocessText(
    text,
    maxNumSentences = lazy.inputSentences ?? DEFAULT_INPUT_SENTENCES
  ) {
    // Filter out emoji characters. The `u` flag is for unicode and `g` for global.
    // Use `Emoji_Presentation` to avoid removing numbers and other symbols.
    const textWithoutEmoji = text.replace(/\p{Emoji_Presentation}/gu, "");
    return (
      this.getSentences(textWithoutEmoji)
        .map(s =>
          // trim and replace consecutive blank by a single one.
          s.trim().replace(
            /(\s*\n\s*)|\s{2,}/g,
            // (\s*\n\s*) -> Matches a newline (`\n`) surrounded by optional whitespace.
            // \s{2,} -> Matches two or more consecutive spaces.
            // g -> Global flag to replace all occurrences in the string.

            (_, newline) => (newline ? "\n" : " ")
            // Callback function:
            // `_` -> First argument (full match) is ignored.
            // `newline` -> If the first capturing group (\s*\n\s*) matched, `newline` is truthy.
            // If `newline` exists, it replaces the match with a single newline ("\n").
            // Otherwise, it replaces the match (extra spaces) with a single space (" ").
          )
        )
        // Remove sentences that are too short without punctuation.
        .filter(
          s =>
            s.length >= MIN_SENTENCE_LENGTH &&
            s.split(" ").length >= MIN_WORD_COUNT &&
            /\p{P}$/u.test(s)
        )
        .slice(0, maxNumSentences)
        .join(" ")
    );
  },

  /**
   * Creates a new ML engine instance with the provided options for link preview.
   *
   * @param {object} options - Configuration options for the ML engine.
   * @param {?function(ProgressAndStatusCallbackParams):void} notificationsCallback A function to call to indicate notifications.
   * @param {AbortSignal} abortSignal - The signal to abort the download.
   * @returns {Promise<MLEngine>} - A promise that resolves to the ML engine instance.
   */
  async createEngine(options, notificationsCallback = null, abortSignal) {
    return lazy.createEngine(options, notificationsCallback, abortSignal);
  },

  /**
   * Generate summary text using AI.
   *
   * Streams generated text through a SentencePostProcessor so callers
   * receive whole sentences via onText, and tears down the engine in all
   * cases (success, error, or abort).
   *
   * @param {string} inputText
   * @param {object} callbacks for progress and error
   * @param {AbortSignal} callbacks.abortSignal - The signal to abort the download.
   * @param {Function} callbacks.onDownload optional for download active
   * @param {Function} callbacks.onText optional for text chunks
   * @param {Function} callbacks.onError optional for error
   */
  async generateTextAI(
    inputText,
    { onDownload, onText, onError, abortSignal } = {}
  ) {
    // Get updated options from remote settings. No failure if no record exists
    const remoteRequestRecord = await lazy.RemoteSettingsManager.getRemoteData({
      collectionName: "ml-inference-request-options",
      filters: { featureId: "link-preview" },
      majorVersion: 1,
    }).catch(() => {
      console.error(
        "Error retrieving request options from remote settings, will use default options."
      );
      return { options: "{}" };
    });

    let remoteRequestOptions = {};

    try {
      remoteRequestOptions = remoteRequestRecord?.options
        ? JSON.parse(remoteRequestRecord.options)
        : {};
    } catch (error) {
      console.error(
        "Error parsing the remote settings request options, will use default options.",
        error
      );
    }

    // Order of preference: user pref, then remote settings, then default.
    // TODO: Unit test that order of preference is correctly respected.
    const processedInput = this.preprocessText(
      inputText,
      lazy.inputSentences ??
        remoteRequestOptions?.inputSentences ??
        DEFAULT_INPUT_SENTENCES
    );

    // Assume generated text is approximately the same length as the input.
    const nPredict = Math.ceil(processedInput.length / CHARACTERS_PER_TOKEN);
    const systemPrompt =
      lazy.prompt ?? remoteRequestOptions?.systemPrompt ?? DEFAULT_INPUT_PROMPT;
    // Estimate an upper bound for the required number of tokens. This estimate
    // must be large enough to include prompt tokens, input tokens, and
    // generated tokens.
    const numContext =
      Math.ceil(
        (processedInput.length + systemPrompt.length) * CONTEXT_SIZE_MULTIPLIER
      ) + nPredict;

    let engine;
    try {
      // Pref-supplied config (lazy.config) is spread last so it can override
      // any of the defaults below.
      engine = await this.createEngine(
        {
          backend: "best-llama",
          engineId: "wllamapreview",
          kvCacheDtype: "q8_0",
          modelFile: "smollm2-360m-instruct-q8_0.gguf",
          modelHubRootUrl: "https://model-hub.mozilla.org",
          modelId: "HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
          modelRevision: "main",
          numBatch: numContext,
          numContext,
          numUbatch: numContext,
          taskName: "wllama-text-generation",
          timeoutMS: -1,
          useMlock: false,
          useMmap: true,
          ...JSON.parse(lazy.config),
        },
        data => {
          if (data.type == lazy.Progress.ProgressType.DOWNLOAD) {
            // Report (stillDownloading, percentComplete) to the caller.
            onDownload?.(
              data.statusText != lazy.Progress.ProgressStatusText.DONE,
              Math.round((100 * data.totalLoaded) / data.total)
            );
          }
        },
        abortSignal
      );

      const postProcessor = await SentencePostProcessor.initialize();
      const blockedTokens = this.getBlockTokenList();
      for await (const val of engine.runWithGenerator({
        nPredict,
        stopTokens: lazy.stopTokens,
        // Push blocked tokens' logits to -Infinity so they are never sampled.
        logit_bias_toks: blockedTokens,
        logit_bias_vals: Array(blockedTokens.length).fill(-Infinity),
        prompt: [
          { role: "system", content: systemPrompt },
          {
            role: "user",
            content: lazy.preUserPrompt + processedInput + lazy.postUserPrompt,
          },
        ],
      })) {
        const { sentence, abort } = postProcessor.put(val.text);
        if (sentence) {
          onText?.(sentence);
        } else if (!val.text) {
          // An empty chunk marks end of generation: emit whatever partial
          // sentence is still buffered.
          const remaining = postProcessor.flush();
          if (remaining) {
            onText?.(remaining);
          }
        }

        if (abort) {
          break;
        }
      }
    } catch (error) {
      onError?.(error);
    } finally {
      // Always release the engine, even on error or early abort.
      await engine?.terminate();
    }
  },
};
/**
 * A class for processing streaming text to detect and extract complete
 * sentences. It buffers incoming text and periodically checks for new sentences
 * based on punctuation and character count limits.
 *
 * This class is useful for incremental sentence processing in NLP tasks.
 */
export class SentencePostProcessor {
  /**
   * The maximum number of sentences to output before truncating the buffer.
   * Use -1 for unlimited.
   *
   * @type {number}
   */
  maxNumOutputSentences = -1;

  /**
   * Stores the current text being processed.
   *
   * @type {string}
   */
  currentText = "";

  /**
   * Tracks the number of sentences processed so far.
   *
   * @type {number}
   */
  currentNumSentences = 0;

  /**
   * Manager for the block list. If null, no block list is applied.
   *
   * @type {BlockListManager}
   */
  blockListManager = null;

  /**
   * Create an instance of the sentence postprocessor.
   *
   * Prefer the static initialize() factory, which also sets up the block
   * list manager from remote settings when enabled.
   *
   * @param {object} config - Configuration object.
   * @param {number} config.maxNumOutputSentences - The maximum number of sentences to
   *   output before truncating the buffer.
   * @param {BlockListManager | null} config.blockListManager - Manager for the block list
   */
  constructor({
    maxNumOutputSentences = lazy.outputSentences,
    blockListManager,
  } = {}) {
    this.maxNumOutputSentences = maxNumOutputSentences;
    this.blockListManager = blockListManager;
  }

  /**
   * Build a postprocessor, lazily initializing (and caching on
   * LinkPreviewModel.blockListManager) the shared block list manager so the
   * remote-settings block list is only downloaded once.
   *
   * @param {object} config - Configuration object.
   * @param {number} config.maxNumOutputSentences - The maximum number of sentences to
   *   output before truncating the buffer.
   * @param {boolean} config.blockListEnabled - Whether to enable block list. If enabled, we
   *   don't return the sentence that has a blocked word along with any sentences coming after.
   * @returns {SentencePostProcessor} - An instance of SentencePostProcessor
   */
  static async initialize({
    maxNumOutputSentences = lazy.outputSentences,
    blockListEnabled = lazy.blockListEnabled,
  } = {}) {
    if (!blockListEnabled) {
      LinkPreviewModel.blockListManager = null;
    } else if (!LinkPreviewModel.blockListManager) {
      LinkPreviewModel.blockListManager =
        await lazy.BlockListManager.initializeFromRemoteSettings({
          blockListName: "link-preview-test-en",
          language: "en",
          fallbackToDefault: true,
          majorVersion: 1,
        });
    }

    return new SentencePostProcessor({
      maxNumOutputSentences,
      blockListManager: LinkPreviewModel.blockListManager,
    });
  }

  /**
   * Processes incoming text, checking if a full sentence has been completed. If
   * a full sentence is detected, it returns the first complete sentence.
   * Otherwise, it returns an empty string.
   *
   * @param {string} text to process
   * @returns {{ sentence: string, abort: boolean }} An object containing:
   *   - `{string} sentence`: The first complete sentence if available, otherwise an empty string.
   *   - `{boolean} abort`: `true` if generation should be aborted early, `false` otherwise.
   */
  put(text) {
    // Already emitted the allowed number of sentences: ignore further input.
    if (this.currentNumSentences == this.maxNumOutputSentences) {
      return { sentence: "", abort: true };
    }
    this.currentText += text;

    // We need to ensure that the current sentence is complete and the next
    // has started before reporting that a sentence is ready.
    const sentences = LinkPreviewModel.getSentences(this.currentText);
    let sentence = "";
    let abort = false;
    if (sentences.length >= 2) {
      // Keep everything after the first sentence buffered for the next call.
      this.currentText = sentences.slice(1).join("");
      // simple way to get number of words ignoring non-whitespace characters
      const isValidSentence =
        sentences[0].trim().split(/\p{White_Space}+/u).length >=
        lazy.minWordsPerOutputSentences;

      // Sentences below the minimum word count are dropped silently and do
      // not count toward the output limit.
      if (isValidSentence) {
        this.currentNumSentences += 1;
      }

      if (this.currentNumSentences == this.maxNumOutputSentences) {
        this.currentText = "";
        abort = true;
      }
      if (isValidSentence) {
        sentence = sentences[0];
      }

      // If the sentence contains a block word, abort
      if (
        this.blockListManager &&
        this.blockListManager.matchAtWordBoundary({
          // Blocklist is always lowercase
          text: sentence.toLowerCase(),
        })
      ) {
        sentence = "";
        abort = true;
        // Mark the limit reached so subsequent put() calls short-circuit.
        this.currentNumSentences = this.maxNumOutputSentences;
      }
    }

    return { sentence, abort };
  }

  /**
   * Flushes the remaining text buffer. This ensures that any last remaining
   * sentence is returned.
   *
   * Note: the flushed text is not checked against the block list; callers
   * rely on put() having aborted beforehand when a blocked word appeared.
   *
   * @returns {string} remaining text that hasn't been processed yet
   */
  flush() {
    return this.currentText;
  }
}