tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

UrlbarTokenizer.sys.mjs (11523B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 /**
      6 * This module exports a tokenizer to be used by the urlbar model.
      7 * Emitted tokens are objects in the shape { type, value }, where type is one
      8 * of UrlbarTokenizer.TYPE.
      9 */
     10 
// Modules imported lazily: each property of `lazy` resolves its module on
// first access.
const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  UrlbarPrefs: "moz-src:///browser/components/urlbar/UrlbarPrefs.sys.mjs",
  UrlbarUtils: "moz-src:///browser/components/urlbar/UrlbarUtils.sys.mjs",
  PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
  UrlUtils: "resource://gre/modules/UrlUtils.sys.mjs",
});

// Module logger, created on first use with a "Tokenizer" prefix.
ChromeUtils.defineLazyGetter(lazy, "logger", () =>
  lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
);

// Fluent bundle used to localize the search-mode restrict keywords.
ChromeUtils.defineLazyGetter(lazy, "gFluentStrings", function () {
  return new Localization(["browser/browser.ftl"]);
});
     26 
/**
 * A token emitted by `UrlbarTokenizer.tokenize()`.
 *
 * @typedef UrlbarSearchStringTokenData
 * @property {Values<typeof UrlbarTokenizer.TYPE>} type
 *   The type of the token.
 * @property {string} value
 *   The value of the token.
 * @property {string} lowerCaseValue
 *   The lower case version of the value.
 */

/**
 * This Map stores key-value pairs where each key is a restrict token
 * and each value is an array containing the localized keyword and the
 * english keyword (deduplicated when the two are identical).
 *
 * For example,
 * "*" maps to ["Bookmarks"] for english locales
 * "*" maps to ["Marcadores", "Bookmarks"] for es-ES
 *
 * Populated lazily by `UrlbarTokenizer.loadL10nRestrictKeywords()`.
 *
 * @type {Map<string, string[]>}
 */
let tokenToKeywords = new Map();
     49 
export var UrlbarTokenizer = {
  // Token types emitted by `tokenize()`. The RESTRICT_* values are numerically
  // contiguous; `isRestrictionToken()` below relies on their range.
  TYPE: {
    // Plain text with no recognized structure.
    TEXT: 1,
    // `looksLikeOrigin()` returned a value for this token that was neither
    // `LOOKS_LIKE_ORIGIN.NONE` nor `LOOKS_LIKE_ORIGIN.OTHER`. It sure looks
    // like an origin.
    POSSIBLE_ORIGIN: 2,
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
    // `looksLikeOrigin()` returned `LOOKS_LIKE_ORIGIN.OTHER` for this token. It
    // may or may not be an origin.
    POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED: 12,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages; or
  // to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused, for example # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },

  /**
   * The keys of characters in RESTRICT that will enter search mode.
   * This is a getter (recomputed on each access) because ACTION is included
   * only while the "scotchBonnet.enableOverride" pref is enabled.
   *
   * @returns {Set<string>} The restriction characters that enter search mode.
   */
  get SEARCH_MODE_RESTRICT() {
    const keys = [
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
    ];
    if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
      keys.push(this.RESTRICT.ACTION);
    }
    return new Set(keys);
  },

  /**
   * Populates `tokenToKeywords`, mapping each local search mode's restrict
   * token to its localized keyword and its English keyword (deduplicated
   * when they are equal).
   *
   * Note: this relies on `formatValues()` returning results in the same
   * order as LOCAL_SEARCH_MODES, since one entry is shift()ed off each
   * array per mode.
   */
  async loadL10nRestrictKeywords() {
    let l10nKeywords = await lazy.gFluentStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}` };
      })
    );

    // The English keywords live in a separate preview bundle so they are
    // available regardless of the active locale.
    let englishSearchStrings = new Localization([
      "preview/enUS-searchFeatures.ftl",
    ]);

    let englishKeywords = await englishSearchStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}-en` };
      })
    );

    for (let { restrict } of lazy.UrlbarUtils.LOCAL_SEARCH_MODES) {
      // A Set collapses the pair when the localized and English keywords
      // are the same string (e.g. on en-US locales).
      let uniqueKeywords = [
        ...new Set([l10nKeywords.shift(), englishKeywords.shift()]),
      ];

      tokenToKeywords.set(restrict, uniqueKeywords);
    }
  },

  /**
   * Gets the cached localized restrict keywords. If keywords are not cached
   * fetch the localized keywords first and then return the keywords.
   *
   * @returns {Promise<Map<string, string[]>>}
   *   Map from restrict token to its [localized, english] keywords.
   */
  async getL10nRestrictKeywords() {
    if (tokenToKeywords.size === 0) {
      await this.loadL10nRestrictKeywords();
    }

    return tokenToKeywords;
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {object} context
   * @param {string} context.searchString
   * @param {string} [context.searchMode]
   * @param {string} context.trimmedSearchString
   * @returns {UrlbarSearchStringTokenData[]}
   *  The tokens associated with the query. Empty for a blank search string.
   */
  tokenize(context) {
    lazy.logger.debug("Tokenizing search string", {
      searchString: context.searchString,
    });
    if (!context.trimmedSearchString) {
      return [];
    }
    let unfiltered = splitString(context);
    return filterTokens(unfiltered);
  },

  /**
   * Given a token, tells if it's a restriction token.
   *
   * NOTE(review): this range check covers RESTRICT_HISTORY (4) through
   * RESTRICT_URL (10) only, so RESTRICT_ACTION (11) is NOT considered a
   * restriction token here — confirm this is intentional.
   *
   * @param {object} token
   *   The token to check.
   * @returns {boolean} Whether the token is a restriction character.
   */
  isRestrictionToken(token) {
    return (
      token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};
    178 
    179 const CHAR_TO_TYPE_MAP = new Map(
    180  Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
    181    char,
    182    UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
    183  ])
    184 );
    185 
    186 /**
    187 * Given a queryContext object, splits its searchString into string tokens.
    188 *
    189 * @param {object} context
    190 * @param {string} context.searchString
    191 * @param {string} [context.searchMode]
    192 * @returns {string[]} An array of string tokens.
    193 */
    194 function splitString({ searchString, searchMode }) {
    195  // The first step is splitting on unicode whitespaces. We ignore whitespaces
    196  // if the search string starts with "data:", to better support Web developers
    197  // and compatiblity with other browsers.
    198  let trimmed = searchString.trim();
    199  let tokens;
    200  if (trimmed.startsWith("data:")) {
    201    tokens = [trimmed];
    202  } else if (trimmed.length < 500) {
    203    tokens = trimmed.split(lazy.UrlUtils.REGEXP_SPACES);
    204  } else {
    205    // If the string is very long, tokenizing all of it would be expensive. So
    206    // we only tokenize a part of it, then let the last token become a
    207    // catch-all.
    208    tokens = trimmed.substring(0, 500).split(lazy.UrlUtils.REGEXP_SPACES);
    209    tokens[tokens.length - 1] += trimmed.substring(500);
    210  }
    211 
    212  if (!tokens.length) {
    213    return tokens;
    214  }
    215 
    216  // If there is no separate restriction token, it's possible we have to split
    217  // a token, if it's the first one and it includes a leading restriction char
    218  // or it's the last one and it includes a trailing restriction char.
    219  // This allows to not require the user to add artificial whitespaces to
    220  // enforce restrictions, for example typing questions would restrict to
    221  // search results.
    222  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
    223 
    224  const firstToken = tokens[0];
    225  const isFirstTokenAKeyword =
    226    !Object.values(UrlbarTokenizer.RESTRICT).includes(firstToken) &&
    227    lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);
    228 
    229  if (hasRestrictionToken || isFirstTokenAKeyword) {
    230    return tokens;
    231  }
    232 
    233  // Check for an unambiguous restriction char at the beginning of the first
    234  // token.
    235  if (
    236    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    237    !lazy.UrlUtils.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
    238    !searchMode
    239  ) {
    240    tokens[0] = firstToken.substring(1);
    241    tokens.splice(0, 0, firstToken[0]);
    242    return tokens;
    243  }
    244 
    245  return tokens;
    246 }
    247 
/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * to token objects with a type.
 *
 * @param {Array} tokens
 *        An array of strings, representing search tokens.
 * @returns {Array} An array of token objects.
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of restriction characters
 *       conflict, the most external ones win. Leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
 */
function filterTokens(tokens) {
  let filtered = [];
  // Candidate restriction tokens, stored as { index, type } and resolved
  // after the loop, since only positional (first/last) ones are honored.
  let restrictions = [];
  // When the first token is a registered Places keyword (and not itself a
  // restriction char), every token is kept as plain text.
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(tokens[0]) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]);

  for (let i = 0; i < tokens.length; ++i) {
    let token = tokens[i];
    // Every token starts out as TEXT; it may be upgraded below.
    let tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: UrlbarTokenizer.TYPE.TEXT,
    };
    // For privacy reasons, we don't want to send a data (or other kind of) URI
    // to a search engine. So we want to parse any single long token below.
    // NOTE(review): when a >500-char token appears among multiple tokens, it
    // is pushed as TEXT and the loop stops, discarding any remaining tokens —
    // confirm this is intended.
    if (tokens.length > 1 && token.length > 500) {
      filtered.push(tokenObj);
      break;
    }

    if (isFirstTokenAKeyword) {
      filtered.push(tokenObj);
      continue;
    }

    let restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      // A stand-alone restriction char: record it for positional resolution
      // below; meanwhile the token itself stays TEXT.
      restrictions.push({ index: i, type: restrictionType });
    } else {
      let looksLikeOrigin = lazy.UrlUtils.looksLikeOrigin(token);
      if (
        looksLikeOrigin == lazy.UrlUtils.LOOKS_LIKE_ORIGIN.OTHER &&
        lazy.UrlbarPrefs.get("allowSearchSuggestionsForSimpleOrigins")
      ) {
        tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED;
      } else if (looksLikeOrigin != lazy.UrlUtils.LOOKS_LIKE_ORIGIN.NONE) {
        tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
      } else if (lazy.UrlUtils.looksLikeUrl(token, { requirePath: true })) {
        tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
      }
    }
    filtered.push(tokenObj);
  }

  // Handle restriction characters.
  if (restrictions.length) {
    // We can apply two kind of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kind of restrictions can be combined, but we
    // can only have one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;
    // Applies restriction `r` to its token unless a restriction of the same
    // kind was already applied. Returns true if the restriction was applied.
    function assignRestriction(r) {
      if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
        if (
          [
            UrlbarTokenizer.TYPE.RESTRICT_TITLE,
            UrlbarTokenizer.TYPE.RESTRICT_URL,
          ].includes(r.type)
        ) {
          if (!matchingRestrictionFound) {
            matchingRestrictionFound = true;
            filtered[r.index].type = r.type;
            return true;
          }
        } else if (!typeRestrictionFound) {
          typeRestrictionFound = true;
          filtered[r.index].type = r.type;
          return true;
        }
      }
      return false;
    }

    // Look at the first token.
    let found = assignRestriction(restrictions.find(r => r.index == 0));
    if (found) {
      // If the first token was assigned, look at the next one.
      assignRestriction(restrictions.find(r => r.index == 1));
    }
    // Then look at the last token.
    let lastIndex = tokens.length - 1;
    found = assignRestriction(restrictions.find(r => r.index == lastIndex));
    if (found) {
      // If the last token was assigned, look at the previous one.
      assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
    }
  }

  lazy.logger.info("Filtered Tokens", filtered);
  return filtered;
}