UrlbarTokenizer.sys.mjs (11523B)
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This module exports a tokenizer to be used by the urlbar model.
 * Emitted tokens are objects in the shape { type, value }, where type is one
 * of UrlbarTokenizer.TYPE.
 */

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  UrlbarPrefs: "moz-src:///browser/components/urlbar/UrlbarPrefs.sys.mjs",
  UrlbarUtils: "moz-src:///browser/components/urlbar/UrlbarUtils.sys.mjs",
  PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
  UrlUtils: "resource://gre/modules/UrlUtils.sys.mjs",
});

// Lazily-built logger shared by all tokenizer log output.
ChromeUtils.defineLazyGetter(lazy, "logger", () =>
  lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
);

// Fluent strings used to localize the search-mode restrict keywords.
ChromeUtils.defineLazyGetter(lazy, "gFluentStrings", function () {
  return new Localization(["browser/browser.ftl"]);
});

/**
 * @typedef UrlbarSearchStringTokenData
 * @property {Values<typeof lazy.UrlbarTokenizer.TYPE>} type
 *   The type of the token.
 * @property {string} value
 *   The value of the token.
 * @property {string} lowerCaseValue
 *   The lower case version of the value.
 */

/**
 * This Map stores key-value pairs where each key is a restrict token
 * and each value is an array containing the localized keyword and the
 * english keyword.
 *
 * For example,
 *  "*" maps to "Bookmarks" for english locales
 *  "*" maps to "Marcadores, Bookmarks" for es-ES
 *
 * Populated lazily by loadL10nRestrictKeywords(); acts as a module-level
 * cache so the Fluent lookups only happen once.
 *
 * @type {Map<string, string[]>}
 */
let tokenToKeywords = new Map();

export var UrlbarTokenizer = {
  TYPE: {
    TEXT: 1,
    // `looksLikeOrigin()` returned a value for this token that was neither
    // `LOOKS_LIKE_ORIGIN.NONE` nor `LOOKS_LIKE_ORIGIN.OTHER`. It sure looks
    // like an origin.
    POSSIBLE_ORIGIN: 2,
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
    // `looksLikeOrigin()` returned `LOOKS_LIKE_ORIGIN.OTHER` for this token. It
    // may or may not be an origin.
    POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED: 12,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages;
  // or to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused, for example # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },

  // The characters in RESTRICT that will enter search mode.
  // Recomputed on every access because the ACTION entry depends on a pref
  // that may change at runtime.
  get SEARCH_MODE_RESTRICT() {
    const keys = [
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
    ];
    if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
      keys.push(this.RESTRICT.ACTION);
    }
    return new Set(keys);
  },

  /**
   * Fetches the localized and the en-US restrict keywords for every local
   * search mode and stores them in the module-level tokenToKeywords cache,
   * keyed by restrict character. Duplicates (e.g. when the locale IS en-US)
   * are collapsed via a Set.
   */
  async loadL10nRestrictKeywords() {
    let l10nKeywords = await lazy.gFluentStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}` };
      })
    );

    // The English keywords live in a separate bundle so they resolve
    // regardless of the active locale.
    let englishSearchStrings = new Localization([
      "preview/enUS-searchFeatures.ftl",
    ]);

    let englishKeywords = await englishSearchStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}-en` };
      })
    );

    // Both keyword arrays are in LOCAL_SEARCH_MODES order, so shift() keeps
    // them aligned with the mode being processed.
    for (let { restrict } of lazy.UrlbarUtils.LOCAL_SEARCH_MODES) {
      let uniqueKeywords = [
        ...new Set([l10nKeywords.shift(), englishKeywords.shift()]),
      ];

      tokenToKeywords.set(restrict, uniqueKeywords);
    }
  },

  /**
   * Gets the cached localized restrict keywords. If keywords are not cached
   * fetch the localized keywords first and then return the keywords.
   *
   * @returns {Promise<Map<string, string[]>>}
   *   Map from restrict character to its localized/english keywords.
   */
  async getL10nRestrictKeywords() {
    if (tokenToKeywords.size === 0) {
      await this.loadL10nRestrictKeywords();
    }

    return tokenToKeywords;
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {object} context
   * @param {string} context.searchString
   * @param {string} [context.searchMode]
   * @param {string} context.trimmedSearchString
   * @returns {UrlbarSearchStringTokenData[]}
   *   The tokens associated with the query.
   */
  tokenize(context) {
    lazy.logger.debug("Tokenizing search string", {
      searchString: context.searchString,
    });
    // An all-whitespace (or empty) string produces no tokens.
    if (!context.trimmedSearchString) {
      return [];
    }
    let unfiltered = splitString(context);
    return filterTokens(unfiltered);
  },

  /**
   * Given a token, tells if it's a restriction token.
   *
   * NOTE(review): the range check below covers RESTRICT_HISTORY (4) through
   * RESTRICT_URL (10) only, so RESTRICT_ACTION (11) tokens are NOT considered
   * restriction tokens here — confirm whether that is intentional before
   * relying on this for action restrictions.
   *
   * @param {object} token
   *   The token to check.
   * @returns {boolean} Whether the token is a restriction character.
   */
  isRestrictionToken(token) {
    return (
      token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};

// Inverse lookup of RESTRICT: maps a restriction character (e.g. "*") to its
// token type (e.g. TYPE.RESTRICT_BOOKMARK).
const CHAR_TO_TYPE_MAP = new Map(
  Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
    char,
    UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
  ])
);

/**
 * Given a queryContext object, splits its searchString into string tokens.
 *
 * @param {object} context
 * @param {string} context.searchString
 * @param {string} [context.searchMode]
 * @returns {string[]} An array of string tokens.
 */
function splitString({ searchString, searchMode }) {
  // The first step is splitting on unicode whitespaces. We ignore whitespaces
  // if the search string starts with "data:", to better support Web developers
  // and compatibility with other browsers.
  let trimmed = searchString.trim();
  let tokens;
  if (trimmed.startsWith("data:")) {
    tokens = [trimmed];
  } else if (trimmed.length < 500) {
    tokens = trimmed.split(lazy.UrlUtils.REGEXP_SPACES);
  } else {
    // If the string is very long, tokenizing all of it would be expensive. So
    // we only tokenize a part of it, then let the last token become a
    // catch-all.
    tokens = trimmed.substring(0, 500).split(lazy.UrlUtils.REGEXP_SPACES);
    tokens[tokens.length - 1] += trimmed.substring(500);
  }

  if (!tokens.length) {
    return tokens;
  }

  // If there is no separate restriction token, it's possible we have to split
  // a token, if it's the first one and it includes a leading restriction char
  // or it's the last one and it includes a trailing restriction char.
  // This allows to not require the user to add artificial whitespaces to
  // enforce restrictions, for example typing questions would restrict to
  // search results.
  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));

  // A first token matching a Places keyword must never be split: the
  // restriction-char check is skipped for it. Restriction characters
  // themselves are excluded so a bare "?" can't be treated as a keyword.
  const firstToken = tokens[0];
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(firstToken) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);

  if (hasRestrictionToken || isFirstTokenAKeyword) {
    return tokens;
  }

  // Check for an unambiguous restriction char at the beginning of the first
  // token. A percent-encoded prefix (e.g. "%20...") must not be mistaken for
  // the OPENPAGE restriction, and an active search mode disables splitting.
  if (
    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    !lazy.UrlUtils.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
    !searchMode
  ) {
    // Split the restriction char into its own leading token.
    tokens[0] = firstToken.substring(1);
    tokens.splice(0, 0, firstToken[0]);
    return tokens;
  }

  return tokens;
}

/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * to token objects with a type.
 *
 * @param {Array} tokens
 *   An array of strings, representing search tokens.
 * @returns {Array} An array of token objects.
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of restriction characters
 *       conflict, the most external ones win. Leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
259 */ 260 function filterTokens(tokens) { 261 let filtered = []; 262 let restrictions = []; 263 const isFirstTokenAKeyword = 264 !Object.values(UrlbarTokenizer.RESTRICT).includes(tokens[0]) && 265 lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]); 266 267 for (let i = 0; i < tokens.length; ++i) { 268 let token = tokens[i]; 269 let tokenObj = { 270 value: token, 271 lowerCaseValue: token.toLocaleLowerCase(), 272 type: UrlbarTokenizer.TYPE.TEXT, 273 }; 274 // For privacy reasons, we don't want to send a data (or other kind of) URI 275 // to a search engine. So we want to parse any single long token below. 276 if (tokens.length > 1 && token.length > 500) { 277 filtered.push(tokenObj); 278 break; 279 } 280 281 if (isFirstTokenAKeyword) { 282 filtered.push(tokenObj); 283 continue; 284 } 285 286 let restrictionType = CHAR_TO_TYPE_MAP.get(token); 287 if (restrictionType) { 288 restrictions.push({ index: i, type: restrictionType }); 289 } else { 290 let looksLikeOrigin = lazy.UrlUtils.looksLikeOrigin(token); 291 if ( 292 looksLikeOrigin == lazy.UrlUtils.LOOKS_LIKE_ORIGIN.OTHER && 293 lazy.UrlbarPrefs.get("allowSearchSuggestionsForSimpleOrigins") 294 ) { 295 tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN_BUT_SEARCH_ALLOWED; 296 } else if (looksLikeOrigin != lazy.UrlUtils.LOOKS_LIKE_ORIGIN.NONE) { 297 tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN; 298 } else if (lazy.UrlUtils.looksLikeUrl(token, { requirePath: true })) { 299 tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL; 300 } 301 } 302 filtered.push(tokenObj); 303 } 304 305 // Handle restriction characters. 306 if (restrictions.length) { 307 // We can apply two kind of restrictions: type (bookmark, search, ...) and 308 // matching (url, title). These kind of restrictions can be combined, but we 309 // can only have one restriction per kind. 
310 let matchingRestrictionFound = false; 311 let typeRestrictionFound = false; 312 function assignRestriction(r) { 313 if (r && !(matchingRestrictionFound && typeRestrictionFound)) { 314 if ( 315 [ 316 UrlbarTokenizer.TYPE.RESTRICT_TITLE, 317 UrlbarTokenizer.TYPE.RESTRICT_URL, 318 ].includes(r.type) 319 ) { 320 if (!matchingRestrictionFound) { 321 matchingRestrictionFound = true; 322 filtered[r.index].type = r.type; 323 return true; 324 } 325 } else if (!typeRestrictionFound) { 326 typeRestrictionFound = true; 327 filtered[r.index].type = r.type; 328 return true; 329 } 330 } 331 return false; 332 } 333 334 // Look at the first token. 335 let found = assignRestriction(restrictions.find(r => r.index == 0)); 336 if (found) { 337 // If the first token was assigned, look at the next one. 338 assignRestriction(restrictions.find(r => r.index == 1)); 339 } 340 // Then look at the last token. 341 let lastIndex = tokens.length - 1; 342 found = assignRestriction(restrictions.find(r => r.index == lastIndex)); 343 if (found) { 344 // If the last token was assigned, look at the previous one. 345 assignRestriction(restrictions.find(r => r.index == lastIndex - 1)); 346 } 347 } 348 349 lazy.logger.info("Filtered Tokens", filtered); 350 return filtered; 351 }