htmlaccel.h (20728B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #ifndef mozilla_htmlaccel_htmlaccel_h 6 #define mozilla_htmlaccel_htmlaccel_h 7 8 #include <string.h> 9 #include <stdint.h> 10 11 // Avoid adding more Gecko-specific headers to keep it easy enough to 12 // copy and paste the contents of this file to Compiler Explorer. 13 #include "mozilla/Attributes.h" 14 15 // This file provides SIMD code for skipping over characters that 16 // the caller doesn't need to act upon. For example, this code can 17 // skip over characters that the HTML tokenizer doesn't need to handle 18 // specially in a given state or this code could be used to skip over 19 // characters that don't need to be escaped in an HTML serializer. 20 21 // ISA SUPPORT: Do not include this file unless the compilation unit is 22 // being compiled either for little-endian aarch64 or for x86/x86_64 with 23 // at least SSSE3 enabled. (We're actually not using this on 32-bit x86 24 // and are compiling with AVX+BMI on x86_64; see below. In the build 25 // system, `HTML_ACCEL_FLAGS` contains the actually-used flags.) 26 // 27 // It's probably feasible to extend this to support little-endian POWER 28 // by defining 29 // MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t table, 30 // uint8x16_t nibbles) { 31 // return vec_perm(table, table, nibbles); 32 // } 33 // but since I don't have a little-endian POWER system to test with, 34 // this is left as an exercise to the reader. (The x86_64 reduction 35 // code should be portable to POWER10 using vec_extractm and the aarch64 36 // reduction code should be portable to older POWER using vec_max.) 37 // 38 // ARMv7 is deliberately not supported due to vqtbl1q_u8 being a newer 39 // addition to NEON. 40 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ 41 # error "A little-endian target is required." 
#endif
#if !(defined(__aarch64__) || defined(__SSSE3__))
# error "Must be targeting SSSE3 or above (notably AVX+BMI), or aarch64."
#endif

// NOTE: This file uses GCC/clang built-ins that provide SIMD portability.
// Compared to pretending unawareness of what arm_neon.h and tmmintrin.h
// map to in GCC and clang, this has the benefit that the code is not stuck
// at an SSSE3 local maximum but adapts maximally to upgrades to SSE 4.2,
// AVX, and AVX+BMI. (Yes, enabling BMI seems to affect more than just
// __builtin_ctz!)
// (We need to check for __clang__, because clang-cl does not define __GNUC__.)
#if !(defined(__GNUC__) || defined(__clang__))
# error "A compiler that supports GCC-style portable SIMD is required."
#endif

// # General
//
// There is an entry point per combination of what characters terminate
// the acceleration loop (i.e. characters that the HTML tokenizer would not
// simply skip over). The shared implementation code is inlined into these
// FFI entry point functions, so the parametrization made inside the FFI
// functions constant-propagates through the implementation internals.
//
// The code examines 16 UTF-16 code units at a time as two 128-bit SIMD
// vectors. First, the bytes are regrouped so that one SIMD vector
// contains the high halves of the UTF-16 code units (zeros for ASCII/Basic
// Latin) and another one contains the low halves.
//
// In the case of the low half, we mask the vector to take the low 4 bits of
// each 8-bit value and do a lookup from a lookup table contained in a SIMD
// vector. The 4 bits index into 16 lanes of the other SIMD vector such that
// we get a vector where the positions corresponding to positions of the
// original code units contain the 8-bit value looked up by the 4-bit
// index.
//
// The lookup operation is available unconditionally on aarch64.
// On x86/x86_64, it is part of the SSSE3 instruction set extension, which is
// why on x86/x86_64 we must not call into this code unless SSSE3 is
// available. (Each additional level of compiling this code with SSE4.2,
// AVX, or AVX+BMI makes this code shorter, which presumably means more
// efficient, so instead of compiling this just with SSSE3, we compile this
// with AVX+BMI on x86_64, considering that CPUs with such capabilities
// have been available for 12 years at the time of landing this code.)
//
// The lookup table contains the loop-terminating ASCII characters in the
// positions given by their low 4 bits. For example, the less-than sign is
// U+003C, so the value 0x3C is at index 0xC (decimal 12). Positions that
// don't correspond to a character of interest have the value 1, except lane
// 1 has the placeholder value 2. This way, characters that we don't want to
// match anything in the lookup table get a non-matching placeholder: U+0001
// gets compared with 2 (semantically U+0002) and everything else not of
// interest gets compared with 1 (semantically U+0001) to produce a
// non-matching lane.
//
// This means that instead of comparing the vector of the low halves of the
// UTF-16 code units against multiple constant vectors each filled in all
// lanes with a given ASCII character of interest, the table lookup gives us
// one vector to compare against where each lane can have a different ASCII
// character of interest to compare with.
//
// This requires the ASCII characters of interest to have mutually distinct
// low 4 bits. This is true for U+0000, &, <, LF, CR, ", and ', but,
// unfortunately, CR, ] and - share the low 4 bits, so cases where we need
// to include a check for ] or - need to do a separate comparison, since CR
// is always in the lookup table.
// Note that it's not worthwhile to pursue
// the low 5 bits instead when possible, because CR and - share the low
// 5 bits, too.
//
// From these operations, we get a vector of 16 8-bit mask lanes where a
// lane is 0xFF if the low 8 bits of the UTF-16 code unit matched an ASCII
// character that terminates the loop and 0x00 otherwise. We lane-wise
// compare the high halves with zero and AND the resulting mask vector
// together with the mask vector that resulted from processing the low 8
// bits to confirm which low 8 bits had 0 as the high 8 bits, i.e. the
// UTF-16 code unit really was Basic Latin.
//
// If we have a configuration that requires terminating the loop on
// surrogates, we check the vector containing the high halves of the UTF-16
// code units for surrogates (by masking certain high bits to compare them
// with a constant) and OR the resulting mask vector together with the
// vector computed above.
//
// Now we have a vector of 16 8-bit mask lanes that corresponds to the input
// of 16 UTF-16 code units to indicate which code units in the run of 16
// UTF-16 code units require terminating the loop (i.e. must not be skipped
// over). At this point, the handling diverges for x86_64 and aarch64.
//
// ## x86_64
//
// We convert the SIMD mask into bits in an ALU register. The operation
// returns a 32-bit type, but only the low 16 bits can be non-zero. If the
// integer is non-zero, the loop terminates, since some lane in the mask was
// non-zero. In this case, we return the number of trailing zeros in the
// integer. (We already know we must have a non-zero bit somewhere in the low
// 16 bits, so we can't end up counting to the high half of the 32-bit type.)
// Due to the little-endian semantics, the first UTF-16 code unit in the
// input corresponds to the least-significant bit in the integer, so when the
// first UTF-16 code unit in the input is unskippable, the least-significant
// bit in the integer is 1, so there are 0 trailing zeros, i.e. 0 skippable
// UTF-16 code units.
//
// ## aarch64
//
// We want to know if any lane in the mask is non-zero to decide whether to
// terminate the loop. If there is a non-zero lane, we want to know the
// position of the first (in the content order of the input UTF-16 text)
// non-zero lane. To accomplish these goals, we bitwise AND the mask vector
// with a vector of 16 constants. Since ANDing with a mask lane set to zero
// results in zero, we need all 16 constants to be non-zero. Yet, we need to
// be able to accommodate the possibility of the first lane in content order
// being set, which means we need to compute 0 as the result. To be able to
// compute 0 but have the constants be non-zero, the constants are numbers
// that need to be subtracted from 16. That is, the constant vector has lanes
// set to numbers from 16 to 1 (inclusive). We do the reduction of the
// resulting SIMD vector to an ALU integer by taking the value of the lane
// with the largest value.
//
// If no mask lane was set, the max operation results in 0, so if the
// integer is zero, the loop continues. Otherwise, we get the number of
// skippable UTF-16 code units by subtracting the integer from 16. That is,
// if the first UTF-16 unit is unskippable, we get 16 as the max lane value
// and 16-16=0.
//
// # Alignment
//
// These functions use unaligned SIMD loads, because alignment
// doesn't matter on aarch64 CPUs or on x86_64 CPUs from the most
// recent decade or so. It's not worthwhile to add complexity for
// old CPUs.
//
// # Inlining
//
// This code was designed for inlining the public functions all the
// way to the caller for maximum LICM. However, due to
// https://github.com/llvm/llvm-project/issues/160886 the public
// functions are currently annotated _not_ to be inlined, because
// currently inlining them into the eventual caller results in
// no LICM but leaving them not-inlined results in one level of
// LICM in the leaf function.
//
// # Acknowledgments
//
// https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/

#if defined(__aarch64__)

# include <arm_neon.h>

#else // x86/x86_64

# include <tmmintrin.h>
// Using syntax that clang-tidy doesn't like to match GCC guidance.
typedef uint8_t uint8x16_t __attribute__((vector_size(16)));

#endif

namespace mozilla::htmlaccel {

namespace detail {

#if defined(__aarch64__)
// The idea is that when this is ANDed with the mask, we get 0 in the
// non-match positions and the leftmost match ends up with the highest
// number. This way, taking the max value of the result is zero if all
// positions are non-match, and otherwise we get a value that when
// subtracted from 16 indicates the index of the leftmost match.
const uint8x16_t INVERTED_ADVANCES = {16, 15, 14, 13, 12, 11, 10, 9,
                                      8, 7, 6, 5, 4, 3, 2, 1};
// ANDed with a 0xFF/0x00 match mask to turn matching lanes into per-lane 1s
// for counting via horizontal add (see CountEscaped).
const uint8x16_t ALL_ONES = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

/// Lane-wise byte table lookup: each byte of aNibbles selects the byte of
/// aTable at that index. Callers in this file mask the indices with
/// NIBBLE_MASK to 0..15 first, so the differing out-of-range semantics of
/// TBL (aarch64) vs. PSHUFB (x86) never come into play.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t aTable,
                                                    uint8x16_t aNibbles) {
  return vqtbl1q_u8(aTable, aNibbles);
}

#else // x86/x86_64

/// Lane-wise byte table lookup (SSSE3 PSHUFB); see the aarch64 variant above.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t aTable,
                                                    uint8x16_t aNibbles) {
  // GCC wants reinterpret_cast
  return reinterpret_cast<uint8x16_t>(_mm_shuffle_epi8(aTable, aNibbles));
}

#endif

// These formulations optimize nicely, so no point in trying something fancier
// to fill all lanes with the same byte.
const uint8x16_t ALL_ZEROS = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
const uint8x16_t NIBBLE_MASK = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF,
                                0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
// A UTF-16 code unit is a surrogate iff (highByte & 0xF8) == 0xD8, i.e. the
// code unit is in [0xD800, 0xDFFF]; these two vectors implement that test.
const uint8x16_t SURROGATE_MASK = {0xF8, 0xF8, 0xF8, 0xF8, 0xF8, 0xF8,
                                   0xF8, 0xF8, 0xF8, 0xF8, 0xF8, 0xF8,
                                   0xF8, 0xF8, 0xF8, 0xF8};
const uint8x16_t SURROGATE_MATCH = {0xD8, 0xD8, 0xD8, 0xD8, 0xD8, 0xD8,
                                    0xD8, 0xD8, 0xD8, 0xD8, 0xD8, 0xD8,
                                    0xD8, 0xD8, 0xD8, 0xD8};
// Separate whole-vector comparands for the two characters whose low 4 bits
// collide with CR (see the file-level comment on the 4-bit constraint).
const uint8x16_t HYPHENS = {'-', '-', '-', '-', '-', '-', '-', '-',
                            '-', '-', '-', '-', '-', '-', '-', '-'};
const uint8x16_t RSQBS = {']', ']', ']', ']', ']', ']', ']', ']',
                          ']', ']', ']', ']', ']', ']', ']', ']'};

// The approach here supports disallowing up to 16 different
// characters that 1) are in the Latin1 range, i.e. U+00FF or
// below, and 2) do not have the lowest 4 bits in common with
// each other.
//
// The code point value of each disallowed character needs
// to be placed in the vector at the position indexed by the
// low 4 bits of the character (low four bits 0 is the leftmost
// position and low four bits 15 is the rightmost position).
//
// U+0001 neither occurs in typical HTML nor is one of the
// code points we care about, so use 1 as the non-matching
// value. We do care about U+0000, unfortunately.
// We use U+0002 at position 1 to make sure it doesn't
// match, either. That is, we put 1 in the positions we
// don't care about except we put 2 at position 1.

/// Disallow U+0000, less-than, ampersand, and carriage return.
const uint8x16_t ZERO_LT_AMP_CR = {0, 2, 1, 1, 1, 1, '&', 1,
                                   1, 1, 1, 1, '<', '\r', 1, 1};
/// Disallow U+0000, less-than, ampersand, carriage return, and line feed.
const uint8x16_t ZERO_LT_AMP_CR_LF = {0, 2, 1, 1, 1, 1, '&', 1,
                                      1, 1, '\n', 1, '<', '\r', 1, 1};
/// Disallow less-than, greater-than, ampersand, and no-break space (U+00A0).
const uint8x16_t LT_GT_AMP_NBSP = {0xA0, 2, 1, 1, 1, 1, '&', 1,
                                   1, 1, 1, 1, '<', 1, '>', 1};
/// Disallow less-than, greater-than, ampersand, no-break space, and double
/// quote.
const uint8x16_t LT_GT_AMP_NBSP_QUOT = {0xA0, 2, '"', 1, 1, 1, '&', 1,
                                        1, 1, 1, 1, '<', 1, '>', 1};
/// Disallow U+0000, less-than, and carriage return.
const uint8x16_t ZERO_LT_CR = {0, 2, 1, 1, 1, 1, 1, 1,
                               1, 1, 1, 1, '<', '\r', 1, 1};
/// Disallow U+0000, less-than, carriage return, and line feed.
const uint8x16_t ZERO_LT_CR_LF = {0, 2, 1, 1, 1, 1, 1, 1,
                                  1, 1, '\n', 1, '<', '\r', 1, 1};
/// Disallow U+0000, single quote, ampersand, and carriage return.
const uint8x16_t ZERO_APOS_AMP_CR = {0, 2, 1, 1, 1, 1, '&', '\'',
                                     1, 1, 1, 1, 1, '\r', 1, 1};
/// Disallow U+0000, single quote, ampersand, carriage return, and line feed.
const uint8x16_t ZERO_APOS_AMP_CR_LF = {0, 2, 1, 1, 1, 1, '&', '\'',
                                        1, 1, '\n', 1, 1, '\r', 1, 1};
/// Disallow U+0000, double quote, ampersand, and carriage return.
const uint8x16_t ZERO_QUOT_AMP_CR = {0, 2, '"', 1, 1, 1, '&', 1,
                                     1, 1, 1, 1, 1, '\r', 1, 1};
/// Disallow U+0000, double quote, ampersand, carriage return, and line feed.
/// (NOTE(review): the original comment said "single quote", but lane 2 holds
/// '"', i.e. the double quote, matching the constant's name.)
const uint8x16_t ZERO_QUOT_AMP_CR_LF = {0, 2, '"', 1, 1, 1, '&', 1,
                                        1, 1, '\n', 1, 1, '\r', 1, 1};
/// Disallow U+0000 and carriage return.
const uint8x16_t ZERO_CR = {0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, '\r', 1, 1};
/// Disallow U+0000, carriage return, and line feed.
const uint8x16_t ZERO_CR_LF = {0, 2, 1, 1, 1, 1, 1, 1,
                               1, 1, '\n', 1, 1, '\r', 1, 1};

/// Compute a 16-lane mask for 16 UTF-16 code units, where a lane
/// is 0x00 if OK to skip and 0xFF if not OK to skip.
/// The boolean parameters are expected to be compile-time constants at each
/// call site so that the branches below constant-propagate away.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t
StrideToMask(const char16_t* aArr /* len = 16 */, uint8x16_t aTable,
             bool aAllowSurrogates = true, bool aAllowHyphen = true,
             bool aAllowRightSquareBracket = true) {
  uint8x16_t first;
  uint8x16_t second;
  // memcpy generates a single unaligned load instruction with both ISAs.
  memcpy(&first, aArr, 16);
  // aArr + 8 code units == 16 bytes in, i.e. the second half of the stride.
  memcpy(&second, aArr + 8, 16);
  // Each shuffle maps to a single instruction on aarch64.
  // On x86/x86_64, how efficiently these shuffles map to instructions
  // depends on the level of instruction set extensions chosen, which
  // is the main reason that we compile this file at a higher extension
  // level than the minimum SSSE3 (and the main reason why this file
  // uses GNU C portable SIMD instead of sticking to what's in the
  // Intel-defined headers).
  uint8x16_t low_halves = __builtin_shufflevector(
      first, second, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
  uint8x16_t high_halves = __builtin_shufflevector(
      first, second, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
  // 0xFF in lanes whose high byte is zero, i.e. the code unit is Basic Latin.
  uint8x16_t high_half_matches = high_halves == ALL_ZEROS;
  // 0xFF in lanes whose low byte equals the table entry selected by the low
  // byte's own low nibble (see the lookup-table comment above).
  uint8x16_t low_half_matches =
      low_halves == TableLookup(aTable, low_halves & NIBBLE_MASK);
  if (!aAllowHyphen) {  // Assumed to be constant-propagated
    // '-' shares its low 4 bits with CR, so it needs a separate comparison.
    low_half_matches |= low_halves == HYPHENS;
  }
  if (!aAllowRightSquareBracket) {  // Assumed to be constant-propagated
    // ']' shares its low 4 bits with CR, so it needs a separate comparison.
    low_half_matches |= low_halves == RSQBS;
  }
  uint8x16_t ret = low_half_matches & high_half_matches;
  if (!aAllowSurrogates) {  // Assumed to be constant-propagated
    // (high byte & 0xF8) == 0xD8 <=> code unit in [0xD800, 0xDFFF].
    ret |= (high_halves & SURROGATE_MASK) == SURROGATE_MATCH;
  }
  return ret;
}

/// Compute a 16-lane mask for 16 Latin1 code units, where a lane
/// is 0x00 if OK to skip and 0xFF if not OK to skip.
/// The boolean arguments exist for signature compatibility with the UTF-16
/// case and are unused in the Latin1 case.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t
StrideToMask(const char* aArr /* len = 16 */, uint8x16_t aTable,
             bool aAllowSurrogates = true, bool aAllowHyphen = true,
             bool aAllowRightSquareBracket = true) {
  uint8x16_t stride;
  // memcpy generates a single unaligned load instruction with both ISAs.
  memcpy(&stride, aArr, 16);
  // == compares lane-wise and returns a mask vector.
  return stride == TableLookup(aTable, stride & NIBBLE_MASK);
}

/// Return the number of code units at the start of [aInput, aEnd) that can
/// be skipped, i.e. the index (relative to aInput) of the first code unit
/// whose StrideToMask lane is non-zero. Processes 16 code units per
/// iteration and stops before a final sub-16-unit tail; any remaining units
/// are not examined here (presumably handled by the caller — the public
/// entry points are in htmlaccelNotInline.h).
template <typename CharT>
MOZ_ALWAYS_INLINE_EVEN_DEBUG size_t
AccelerateTextNode(const CharT* aInput, const CharT* aEnd, uint8x16_t aTable,
                   bool aAllowSurrogates = true, bool aAllowHyphen = true,
                   bool aAllowRightSquareBracket = true) {
  const CharT* current = aInput;
  while (aEnd - current >= 16) {
    uint8x16_t mask = StrideToMask(current, aTable, aAllowSurrogates,
                                   aAllowHyphen, aAllowRightSquareBracket);
#if defined(__aarch64__)
    // See the file-level "## aarch64" comment: the max of
    // (mask & {16..1}) is 0 when no lane matched, and otherwise
    // 16 minus the index of the first matching lane.
    uint8_t max = vmaxvq_u8(mask & INVERTED_ADVANCES);
    if (max != 0) {
      return size_t((current - aInput) + 16 - max);
    }
#else  // x86/x86_64
    int int_mask = _mm_movemask_epi8(mask);
    if (int_mask != 0) {
      // The least-significant bit in the integer corresponds to
      // the first SIMD lane in text order. Hence, we need to count
      // trailing zeros. We already checked that the bits are not
      // all zeros, so __builtin_ctz isn't UB.
      return size_t((current - aInput) + __builtin_ctz(int_mask));
    }
#endif
    current += 16;
  }
  return size_t(current - aInput);
}

/// Count the characters in [aInput, aEnd) that require escaping when
/// serializing: '<', '>', '&', U+00A0 (no-break space), and — when
/// aCountDoubleQuote is true — '"'. Handles the sub-16-unit tail with a
/// scalar loop, so the whole range is counted.
template <typename CharT>
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint32_t CountEscaped(const CharT* aInput,
                                                   const CharT* aEnd,
                                                   bool aCountDoubleQuote) {
  uint32_t numEncodedChars = 0;
  const CharT* current = aInput;
  while (aEnd - current >= 16) {
    uint8x16_t mask = StrideToMask(
        current, aCountDoubleQuote ? LT_GT_AMP_NBSP_QUOT : LT_GT_AMP_NBSP);
#if defined(__aarch64__)
    // Reduce on each iteration to avoid branching for overflow avoidance
    // on each iteration.
    numEncodedChars += vaddvq_u8(mask & ALL_ONES);
#else  // x86_64
    numEncodedChars += __builtin_popcount(_mm_movemask_epi8(mask));
#endif
    current += 16;
  }
  // Scalar tail for the final < 16 code units.
  while (current != aEnd) {
    CharT c = *current;
    if ((aCountDoubleQuote && c == CharT('"')) || c == CharT('&') ||
        c == CharT('<') || c == CharT('>') || c == CharT(0xA0)) {
      ++numEncodedChars;
    }
    ++current;
  }
  return numEncodedChars;
}

/// Check whether [aInput, aEnd) contains U+0000, '<', '&', or CR (the
/// characters disallowed by ZERO_LT_AMP_CR). Handles the sub-16-unit tail
/// with a scalar loop, so the whole range is checked.
MOZ_ALWAYS_INLINE_EVEN_DEBUG bool ContainsMarkup(const char16_t* aInput,
                                                 const char16_t* aEnd) {
  const char16_t* current = aInput;
  while (aEnd - current >= 16) {
    uint8x16_t mask = StrideToMask(current, ZERO_LT_AMP_CR);
#if defined(__aarch64__)
    uint8_t max = vmaxvq_u8(mask);
    if (max != 0) {
      return true;
    }
#else  // x86/x86_64
    int int_mask = _mm_movemask_epi8(mask);
    if (int_mask != 0) {
      return true;
    }
#endif
    current += 16;
  }
  // Scalar tail for the final < 16 code units.
  while (current != aEnd) {
    char16_t c = *current;
    if (c == char16_t('<') || c == char16_t('&') || c == char16_t('\r') ||
        c == char16_t('\0')) {
      return true;
    }
    ++current;
  }
  return false;
}

}  // namespace detail

// Public entry points are in htmlaccelNotInline.h for now.

}  // namespace mozilla::htmlaccel

#endif  // mozilla_htmlaccel_htmlaccel_h