tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

htmlaccel.h (20728B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #ifndef mozilla_htmlaccel_htmlaccel_h
      6 #define mozilla_htmlaccel_htmlaccel_h
      7 
      8 #include <string.h>
      9 #include <stdint.h>
     10 
     11 // Avoid adding more Gecko-specific headers to keep it easy enough to
     12 // copy and paste the contents of this file to Compiler Explorer.
     13 #include "mozilla/Attributes.h"
     14 
     15 // This file provides SIMD code for skipping over characters that
     16 // the caller doesn't need to act upon. For example, this code can
     17 // skip over characters that the HTML tokenizer doesn't need to handle
     18 // specially in a given state or this code could be used to skip over
     19 // characters that don't need to be escaped in an HTML serializer.
     20 
     21 // ISA SUPPORT: Do not include this file unless the compilation unit is
     22 // being compiled either for little-endian aarch64 or for x86/x86_64 with
     23 // at least SSSE3 enabled. (We're actually not using this on 32-bit x86
     24 // and are compiling with AVX+BMI on x86_64; see below. In the build
     25 // system, `HTML_ACCEL_FLAGS` contains the actually-used flags.)
     26 //
     27 // It's probably feasible to extend this to support little-endian POWER
     28 // by defining
     29 // MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t table,
     30 // uint8x16_t nibbles) {
     31 //  return vec_perm(table, table, nibbles);
     32 // }
     33 // but since I don't have a little-endian POWER system to test with,
     34 // this is left as an exercise to the reader. (The x86_64 reduction
     35 // code should be portable to POWER10 using vec_extractm and the aarch64
     36 // reduction code should be portable to older POWER using vec_max.)
     37 //
     38 // ARMv7 is deliberately not supported due to vqtbl1q_u8 being a newer
     39 // addition to NEON.
     40 #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
     41 #  error "A little-endian target is required."
     42 #endif
     43 #if !(defined(__aarch64__) || defined(__SSSE3__))
     44 #  error "Must be targeting SSSE3 or above (notably AVX+BMI), or aarch64."
     45 #endif
     46 
     47 // NOTE: This file uses GCC/clang built-ins that provide SIMD portability.
     48 // Compared to pretending unawareness of what arm_neon.h and tmmintrin.h
     49 // map to in GCC and clang, this has the benefit that the code is not stuck
     50 // at an SSSE3 local maximum but adapts maximally to upgrades to SSE 4.2,
     51 // AVX, and AVX+BMI. (Yes, enabling BMI seems to affect more than just
     52 // __builtin_ctz!)
     53 // (We need to check for __clang__, because clang-cl does not define __GNUC__.)
     54 #if !(defined(__GNUC__) || defined(__clang__))
     55 #  error "A compiler that supports GCC-style portable SIMD is required."
     56 #endif
     57 
     58 // # General
     59 //
     60 // There is an entry point per combination of what characters terminate
     61 // the acceleration loop (i.e. characters that the HTML tokenizer would not
     62 // simply skip over). The shared implementation code is inlined into these
     63 // FFI entry point functions, so the parametrization made inside the FFI
     64 // functions constant-propagates through the implementation internals.
     65 //
     66 // The code examines 16 UTF-16 code units at a time as two 128-bit SIMD
     67 // vectors. First, the bytes are regrouped so that one SIMD vector
     68 // contains the high halves of the UTF-16 code units (zeros for ASCII/Basic
     69 // Latin) and another one contains the low halves.
     70 //
     71 // In the case of the low half, we mask the vector to take the low 4 bits of
     72 // each 8-bit value and do a lookup from a lookup table contained in a SIMD
     73 // vector. The 4 bits index into 16 lanes of the other SIMD vector such that
     74 // we get a vector where the positions corresponding to positions of the
     75 // original code units contain the 8-bit value looked up from by the 4-bit
     76 // index.
     77 //
     78 // The lookup operation is available unconditionally on aarch64. On
     79 // x86/x86_64, it is part of the SSSE3 instruction set extension, which is
     80 // why on x86/x86_64 we must not call into this code unless SSSE3 is
     81 // available. (Each additional level of compiling this code with SSE4.2,
     82 // AVX, or AVX+BMI makes this code shorter, which presumably means more
     83 // efficient, so instead of compiling this just with SSSE3, we compile this
     84 // with AVX+BMI on x86_64, considering that CPUs with such capabilities
     85 // have been available for 12 years at the time of landing this code.)
     86 //
     87 // The lookup table contains the loop-terminating ASCII characters in the
     88 // positions given by their low 4 bits. For example, the less-than sign is
     89 // U+003C, so the value 0x3C is at index 0xC (decimal 12). Positions that
     90 // don’t correspond to a character of interest have the value 1, except lane
     91 // 1 has the placeholder value 2. This way, characters that we don’t want to
     92 // match anything in the lookup table get a non-matching placeholder: U+0001
     93 // gets compared with 2 (semantically U+0002) and everything else not of
     94 // interest gets compared with 1 (semantically U+0001) to produce a
     95 // non-matching lane.
     96 //
     97 // This means that instead of comparing the vector of the low halves of the
     98 // UTF-16 code units against multiple constant vectors each filled in all
     99 // lanes with a given ASCII character of interest, the table lookup gives us
    100 // one vector to compare against where each lane can have a different ASCII
    101 // character of interest to compare with.
    102 //
    103 // This requires the ASCII characters of interest to have mutually distinct
    104 // low 4 bits. This is true for U+0000, &, <, LF, CR, ", and ', but,
    105 // unfortunately, CR, ] and - share the low 4 bits, so cases where we need
    106 // to include a check for ] or - needs to do a separate check, since CR is
    107 // always in the lookup table. Note that it's not worthwhile to pursue
    108 // the low 5 bits instead when possible, because CR and - share the low
    109 // 5 bits, too.
    110 //
    111 // From these operations, we get a vector of 16 8-bit mask lanes where a
    112 // lane is 0xFF if the low 8 bits of the UTF-16 code unit matched an ASCII
    113 // character that terminates the loop and 0x00 otherwise. We lane-wise
    114 // compare the high halves with zero and AND the resulting mask vector
    115 // together with the mask vector that resulted from processing the low 8
    116 // bits to confirm which low 8 bits had 0 as the high 8 bits, i.e. the
    117 // UTF-16 code unit really was Basic Latin.
    118 //
    119 // If we have a configuration that requires terminating the loop on
    120 // surrogates, we check the vector containing the high halves of the UTF-16
    121 // code units for surrogates (by masking certain high bits to compare them
    122 // with a constant) and OR the resulting mask vector together with the
    123 // vector computed above.
    124 //
    125 // Now we have a vector of 16 8-bit mask lanes that corresponds to the input
    126 // of 16 UTF-16 code units to indicate which code units in the run of 16
    127 // UTF-16 code units require terminating the loop (i.e. must not be skipped
    128 // over). At this point, the handling diverges for x86_64 and aarch64.
    129 //
    130 // ## x86_64
    131 //
    132 // We convert the SIMD mask into bits in an ALU register. The operation
    133 // returns a 32-bit type, but only the low 16 bits can be non-zero. If the
    134 // integer is non-zero, the loop terminates, since some lane in the mask was
    135 // non-zero. In this case, we return the number of trailing zeros in the
    136 // integer. (We already know we must have a non-zero bit somewhere in the low
    137 // 16 bits, so we can’t end up counting to the high half of the 32-bit type.)
    138 // Due to the little-endian semantics, the first UTF-16 code unit in the
    139 // input corresponds to the least-significant bit in the integer, so when the
    140 // first UTF-16 code unit in the input is unskippable, the least-significant
    141 // bit in the integer is 1, so there are 0 trailing zeros, i.e. 0 skippable
    142 // UTF-16 code units.
    143 //
    144 // ## aarch64
    145 //
    146 // We want to know if any lane in the mask is non-zero to decide whether to
    147 // terminate the loop. If there is a non-zero lane, we want to know the
    148 // position of the first (in the content order of the input UTF-16 text)
    149 // non-zero lane. To accomplish these goals, we bitwise AND the mask vector
    150 // with a vector of 16 constants. Since ANDing with a mask lane set to zero
    151 // results in zero, we need all 16 constants to be non-zero. Yet, we need to
    152 // be able to accommodate the possibility of first lane in content order
    153 // being set, which means we need to compute 0 as the result. To be able to
    154 // compute 0 but have the constants be non-zero, the constants are numbers
    155 // that need be subtracted from 16. That is, the constant vector has lanes
    156 // set to numbers from 16 to 1 (inclusive). We do the reduction of the
    157 // resulting SIMD vector to an ALU integer by taking the value of the lane
    158 // with the largest value.
    159 //
    160 // If no mask lane was set, the max operation results in 0, so if the
    161 // integer is zero, the loop continues. Otherwise, we get the number of
    162 // skippable UTF-16 code units by subtracting the integer from 16. That is,
    163 // if the first UTF-16 unit is unskippable, we get 16 as the max lane value
    164 // and 16-16=0.
    165 //
    166 // # Alignment
    167 //
    168 // These functions use unaligned SIMD loads, because alignment
    169 // doesn't matter on aarch64 CPUs or on x86_64 CPUs from the most
    170 // recent decade or so. It's not worthwhile to add complexity for
    171 // old CPUs.
    172 //
    173 // # Inlining
    174 //
    175 // This code was designed for inlining the public functions all the
    176 // way to the caller for maximum LICM. However, due to
    177 // https://github.com/llvm/llvm-project/issues/160886 the public
    178 // functions are currently annotated _not_ to be inlined, because
    179 // currently inlining them into the eventual caller results in
    180 // no LICM but leaving them not-inlined results in one level of
    181 // LICM in the leaf function.
    182 //
    183 // # Acknowledgments
    184 //
    185 // https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/
    186 
    187 #if defined(__aarch64__)
    188 
    189 #  include <arm_neon.h>
    190 
    191 #else  // x86/x86_64
    192 
    193 #  include <tmmintrin.h>
    194 // Using syntax that clang-tidy doesn't like to match GCC guidance.
    195 typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
    196 
    197 #endif
    198 
    199 namespace mozilla::htmlaccel {
    200 
    201 namespace detail {
    202 
#if defined(__aarch64__)
// The idea is that when this is ANDed with the mask, we get 0 in the
// non-match positions and the leftmost match ends up with the highest
// number. This way, taking the max value of the result is zero if all
// positions are non-match, and otherwise we get a value that when
// subtracted from 16 indicates the index of the leftmost match.
const uint8x16_t INVERTED_ADVANCES = {16, 15, 14, 13, 12, 11, 10, 9,
                                      8,  7,  6,  5,  4,  3,  2,  1};
// ANDed with a match mask so that each matching (0xFF) lane contributes
// exactly 1 to a horizontal add; used by CountEscaped to count matches
// without per-iteration overflow checks.
const uint8x16_t ALL_ONES = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

/// Look up one byte of `aTable` per lane, indexed by the corresponding
/// lane of `aNibbles`. Callers mask indices with NIBBLE_MASK first, so
/// every index is in 0..15.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t aTable,
                                                    uint8x16_t aNibbles) {
  return vqtbl1q_u8(aTable, aNibbles);
}

#else  // x86/x86_64

/// Look up one byte of `aTable` per lane, indexed by the corresponding
/// lane of `aNibbles`. Callers mask indices with NIBBLE_MASK first, so
/// every index is in 0..15.
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t TableLookup(uint8x16_t aTable,
                                                    uint8x16_t aNibbles) {
  // GCC wants reinterpret_cast
  return reinterpret_cast<uint8x16_t>(_mm_shuffle_epi8(aTable, aNibbles));
}

#endif
    227 
    228 // These formulations optimize nicely, so no point in trying something fancier
    229 // to fill all lanes with the same byte.
    230 const uint8x16_t ALL_ZEROS = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    231 const uint8x16_t NIBBLE_MASK = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF,
    232                                0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
    233 const uint8x16_t SURROGATE_MASK = {0xF8, 0xF8, 0xF8, 0xF8, 0xF8, 0xF8,
    234                                   0xF8, 0xF8, 0xF8, 0xF8, 0xF8, 0xF8,
    235                                   0xF8, 0xF8, 0xF8, 0xF8};
    236 const uint8x16_t SURROGATE_MATCH = {0xD8, 0xD8, 0xD8, 0xD8, 0xD8, 0xD8,
    237                                    0xD8, 0xD8, 0xD8, 0xD8, 0xD8, 0xD8,
    238                                    0xD8, 0xD8, 0xD8, 0xD8};
    239 const uint8x16_t HYPHENS = {'-', '-', '-', '-', '-', '-', '-', '-',
    240                            '-', '-', '-', '-', '-', '-', '-', '-'};
    241 const uint8x16_t RSQBS = {']', ']', ']', ']', ']', ']', ']', ']',
    242                          ']', ']', ']', ']', ']', ']', ']', ']'};
    243 
// The approach here supports disallowing up to 16 different
// characters that 1) are in the Latin1 range, i.e. U+00FF or
// below, and 2) do not have the lowest 4 bits in common with
// each other.
//
// The code point value of each disallowed character needs
// to be placed in the vector at the position indexed by the
// low 4 bits of the character (low four bits 0 is the leftmost
// position and low four bits 15 is the rightmost position).
//
// U+0001 neither occurs in typical HTML nor is one of the
// code points we care about, so use 1 as the non-matching
// value. We do care about U+0000, unfortunately.
// We use U+0002 at position 1 to make sure it doesn't
// match, either. That is, we put 1 in the positions we
// don't care about except we put 2 at position 1.

/// Disallow U+0000, less-than, ampersand, and carriage return.
const uint8x16_t ZERO_LT_AMP_CR = {0, 2, 1, 1, 1,   1,    '&', 1,
                                   1, 1, 1, 1, '<', '\r', 1,   1};
/// Disallow U+0000, less-than, ampersand, carriage return, and line feed.
const uint8x16_t ZERO_LT_AMP_CR_LF = {0, 2, 1,    1, 1,   1,    '&', 1,
                                      1, 1, '\n', 1, '<', '\r', 1,   1};
/// Disallow less-than, greater-than, ampersand, and no-break space.
/// (Position 0 holds 0xA0, so U+0000 is allowed here.)
const uint8x16_t LT_GT_AMP_NBSP = {0xA0, 2, 1, 1, 1,   1, '&', 1,
                                   1,    1, 1, 1, '<', 1, '>', 1};
/// Disallow less-than, greater-than, ampersand, no-break space, and double
/// quote.
const uint8x16_t LT_GT_AMP_NBSP_QUOT = {0xA0, 2, '"', 1, 1,   1, '&', 1,
                                        1,    1, 1,   1, '<', 1, '>', 1};
/// Disallow U+0000, less-than, and carriage return.
const uint8x16_t ZERO_LT_CR = {0, 2, 1, 1, 1,   1,    1, 1,
                               1, 1, 1, 1, '<', '\r', 1, 1};
/// Disallow U+0000, less-than, carriage return, and line feed.
const uint8x16_t ZERO_LT_CR_LF = {0, 2, 1,    1, 1,   1,    1, 1,
                                  1, 1, '\n', 1, '<', '\r', 1, 1};
/// Disallow U+0000, single quote, ampersand, and carriage return.
const uint8x16_t ZERO_APOS_AMP_CR = {0, 2, 1, 1, 1, 1,    '&', '\'',
                                     1, 1, 1, 1, 1, '\r', 1,   1};
/// Disallow U+0000, single quote, ampersand, carriage return, and line feed.
const uint8x16_t ZERO_APOS_AMP_CR_LF = {0, 2, 1,    1, 1, 1,    '&', '\'',
                                        1, 1, '\n', 1, 1, '\r', 1,   1};
/// Disallow U+0000, double quote, ampersand, and carriage return.
const uint8x16_t ZERO_QUOT_AMP_CR = {0, 2, '"', 1, 1, 1,    '&', 1,
                                     1, 1, 1,   1, 1, '\r', 1,   1};
/// Disallow U+0000, double quote, ampersand, carriage return, and line feed.
const uint8x16_t ZERO_QUOT_AMP_CR_LF = {0, 2, '"',  1, 1, 1,    '&', 1,
                                        1, 1, '\n', 1, 1, '\r', 1,   1};
/// Disallow U+0000 and carriage return.
const uint8x16_t ZERO_CR = {0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, '\r', 1, 1};
/// Disallow U+0000, carriage return, and line feed.
const uint8x16_t ZERO_CR_LF = {0, 2, 1,    1, 1, 1,    1, 1,
                               1, 1, '\n', 1, 1, '\r', 1, 1};
    297 
/// Compute a 16-lane mask for 16 UTF-16 code units, where a lane
/// is 0x00 if OK to skip and 0xFF if not OK to skip.
///
/// aTable is one of the lookup-table vectors above. The boolean
/// parameters are expected to be compile-time constants in each
/// caller (see the `Assumed to be constant-propagated` notes below).
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t
StrideToMask(const char16_t* aArr /* len = 16 */, uint8x16_t aTable,
             bool aAllowSurrogates = true, bool aAllowHyphen = true,
             bool aAllowRightSquareBracket = true) {
  uint8x16_t first;
  uint8x16_t second;
  // memcpy generates a single unaligned load instruction with both ISAs.
  memcpy(&first, aArr, 16);
  // aArr + 8 is an offset of 16 bytes: 8 char16_t units per vector.
  memcpy(&second, aArr + 8, 16);
  // Each shuffle maps to a single instruction on aarch64.
  // On x86/x86_64, how efficiently these shuffles map to instructions
  // depends on the level of instruction set extensions chosen, which
  // is the main reason that we compile this file at a higher extension
  // level than the minimum SSSE3 (and the main reason why this file
  // uses GNU C portable SIMD instead of sticking to what's in the
  // Intel-defined headers).
  uint8x16_t low_halves = __builtin_shufflevector(
      first, second, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
  uint8x16_t high_halves = __builtin_shufflevector(
      first, second, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
  // A code unit is Basic Latin only if its high byte is zero.
  uint8x16_t high_half_matches = high_halves == ALL_ZEROS;
  uint8x16_t low_half_matches =
      low_halves == TableLookup(aTable, low_halves & NIBBLE_MASK);
  if (!aAllowHyphen) {  // Assumed to be constant-propagated
    // '-' shares its low 4 bits with CR, so it needs a separate compare.
    low_half_matches |= low_halves == HYPHENS;
  }
  if (!aAllowRightSquareBracket) {  // Assumed to be constant-propagated
    // ']' shares its low 4 bits with CR, so it needs a separate compare.
    low_half_matches |= low_halves == RSQBS;
  }
  uint8x16_t ret = low_half_matches & high_half_matches;
  if (!aAllowSurrogates) {  // Assumed to be constant-propagated
    ret |= (high_halves & SURROGATE_MASK) == SURROGATE_MATCH;
  }
  return ret;
}
    335 
/// Compute a 16-lane mask for 16 Latin1 code units, where a lane
/// is 0x00 if OK to skip and 0xFF if not OK to skip.
/// The boolean arguments exist for signature compatibility with the UTF-16
/// case and are unused in the Latin1 case. (8-bit code units cannot be
/// surrogates.)
MOZ_ALWAYS_INLINE_EVEN_DEBUG uint8x16_t
StrideToMask(const char* aArr /* len = 16 */, uint8x16_t aTable,
             bool aAllowSurrogates = true, bool aAllowHyphen = true,
             bool aAllowRightSquareBracket = true) {
  uint8x16_t stride;
  // memcpy generates a single unaligned load instruction with both ISAs.
  memcpy(&stride, aArr, 16);
  // == compares lane-wise and returns a mask vector.
  return stride == TableLookup(aTable, stride & NIBBLE_MASK);
}
    350 
/// Count the leading code units of [aInput, aEnd) that are skippable
/// per `aTable` (and the hyphen / right-square-bracket / surrogate
/// options).
///
/// Scans 16 code units at a time. Returns the index (relative to
/// aInput) of the first non-skippable code unit found within the full
/// 16-unit strides; if every full stride is skippable, returns the
/// number of code units covered by full strides, leaving the final
/// sub-16 remainder for the caller to handle scalar-wise.
template <typename CharT>
MOZ_ALWAYS_INLINE_EVEN_DEBUG size_t
AccelerateTextNode(const CharT* aInput, const CharT* aEnd, uint8x16_t aTable,
                   bool aAllowSurrogates = true, bool aAllowHyphen = true,
                   bool aAllowRightSquareBracket = true) {
  const CharT* current = aInput;
  while (aEnd - current >= 16) {
    uint8x16_t mask = StrideToMask(current, aTable, aAllowSurrogates,
                                   aAllowHyphen, aAllowRightSquareBracket);
#if defined(__aarch64__)
    // Max lane is (16 - index) of the first non-skippable lane, or 0 if
    // all lanes are skippable; see INVERTED_ADVANCES.
    uint8_t max = vmaxvq_u8(mask & INVERTED_ADVANCES);
    if (max != 0) {
      return size_t((current - aInput) + 16 - max);
    }
#else  // x86/x86_64
    int int_mask = _mm_movemask_epi8(mask);
    if (int_mask != 0) {
      // The least-significant bit in the integer corresponds to
      // the first SIMD lane in text order. Hence, we need to count
      // trailing zeros. We already checked that the bits are not
      // all zeros, so __builtin_ctz isn't UB.
      return size_t((current - aInput) + __builtin_ctz(int_mask));
    }
#endif
    current += 16;
  }
  return size_t(current - aInput);
}
    379 
    380 template <typename CharT>
    381 MOZ_ALWAYS_INLINE_EVEN_DEBUG uint32_t CountEscaped(const CharT* aInput,
    382                                                   const CharT* aEnd,
    383                                                   bool aCountDoubleQuote) {
    384  uint32_t numEncodedChars = 0;
    385  const CharT* current = aInput;
    386  while (aEnd - current >= 16) {
    387    uint8x16_t mask = StrideToMask(
    388        current, aCountDoubleQuote ? LT_GT_AMP_NBSP_QUOT : LT_GT_AMP_NBSP);
    389 #if defined(__aarch64__)
    390    // Reduce on each iteration to avoid branching for overflow avoidance
    391    // on each iteration.
    392    numEncodedChars += vaddvq_u8(mask & ALL_ONES);
    393 #else  // x86_64
    394    numEncodedChars += __builtin_popcount(_mm_movemask_epi8(mask));
    395 #endif
    396    current += 16;
    397  }
    398  while (current != aEnd) {
    399    CharT c = *current;
    400    if ((aCountDoubleQuote && c == CharT('"')) || c == CharT('&') ||
    401        c == CharT('<') || c == CharT('>') || c == CharT(0xA0)) {
    402      ++numEncodedChars;
    403    }
    404    ++current;
    405  }
    406  return numEncodedChars;
    407 }
    408 
    409 MOZ_ALWAYS_INLINE_EVEN_DEBUG bool ContainsMarkup(const char16_t* aInput,
    410                                                 const char16_t* aEnd) {
    411  const char16_t* current = aInput;
    412  while (aEnd - current >= 16) {
    413    uint8x16_t mask = StrideToMask(current, ZERO_LT_AMP_CR);
    414 #if defined(__aarch64__)
    415    uint8_t max = vmaxvq_u8(mask);
    416    if (max != 0) {
    417      return true;
    418    }
    419 #else  // x86/x86_64
    420    int int_mask = _mm_movemask_epi8(mask);
    421    if (int_mask != 0) {
    422      return true;
    423    }
    424 #endif
    425    current += 16;
    426  }
    427  while (current != aEnd) {
    428    char16_t c = *current;
    429    if (c == char16_t('<') || c == char16_t('&') || c == char16_t('\r') ||
    430        c == char16_t('\0')) {
    431      return true;
    432    }
    433    ++current;
    434  }
    435  return false;
    436 }
    437 
    438 }  // namespace detail
    439 
    440 // Public entry points are in htmlaccelNotInline.h for now.
    441 
    442 }  // namespace mozilla::htmlaccel
    443 
    444 #endif  // mozilla_htmlaccel_htmlaccel_h