tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Unicode.h (16658B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
      2 * vim: set ts=8 sts=2 et sw=2 tw=80:
      3 * This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef util_Unicode_h
      8 #define util_Unicode_h
      9 
     10 #include "mozilla/Casting.h"  // mozilla::AssertedCast
     11 
     12 #include "jspubtd.h"
     13 
     14 #include "util/UnicodeNonBMP.h"
     15 
     16 namespace js {
     17 namespace unicode {
     18 
// Boolean lookup tables defined elsewhere. The inline helpers in this header
// only index them with values below 128 (see IsIdentifierStart,
// IsIdentifierPart and IsSpace).
extern const bool js_isidstart[];
extern const bool js_isident[];
extern const bool js_isspace[];
     22 
     23 /*
     24 * This namespace contains all the knowledge required to handle Unicode
     25 * characters in JavaScript.
     26 *
     27 * SPACE
     28 *   Every character that is either in the ECMAScript class WhiteSpace
     29 *   (ES2016, § 11.2) or in LineTerminator (ES2016, § 11.3).
     30 *
     31 *   WhiteSpace
     32 *    \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
     33 *    and every other Unicode character with the General Category "Zs".
     34 *    See <http://www.unicode.org/reports/tr44/#UnicodeData.txt> for more
     35 *    information about General Categories and the UnicodeData.txt file.
     36 *
     37 *   LineTerminator
     38 *    \u000A, \u000D, \u2028, \u2029
     39 *
     40 * UNICODE_ID_START
     41 *   These are all characters with the Unicode property «ID_Start».
     42 *
     43 * UNICODE_ID_CONTINUE_ONLY
     44 *   These are all characters with the Unicode property «ID_Continue» minus all
     45 *   characters with the Unicode property «ID_Start».
     46 *   And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6)
     47 *
     48 * UNICODE_ID_CONTINUE
     49 *   These are all characters with the Unicode property «ID_Continue».
     50 *   And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6)
     51 *
     52 *   Attention: UNICODE_ID_START is _not_ IdentifierStart, but you could build
     53 *   a matcher for the real IdentifierPart like this:
     54 *
     55 *   if char in ['$', '_']:
     56 *      return True
     57 *   if GetFlag(char) & UNICODE_ID_CONTINUE:
     58 *      return True
     59 *
     60 */
     61 
     62 namespace CharFlag {
     63 const uint8_t SPACE = 1 << 0;
     64 const uint8_t UNICODE_ID_START = 1 << 1;
     65 const uint8_t UNICODE_ID_CONTINUE_ONLY = 1 << 2;
     66 const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY;
     67 }  // namespace CharFlag
     68 
// Named UTF-16 code unit constants. Some are used by helpers in this header
// (e.g. NO_BREAK_SPACE in IsSpace, MICRO_SIGN and DIVISION_SIGN in
// ChangesWhenUpperCased); the remainder are presumably consumed by other
// files that include it.
constexpr char16_t NO_BREAK_SPACE = 0x00A0;
constexpr char16_t MICRO_SIGN = 0x00B5;
constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
constexpr char16_t LATIN_SMALL_LETTER_A_WITH_GRAVE = 0x00E0;
constexpr char16_t DIVISION_SIGN = 0x00F7;
constexpr char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF;
constexpr char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;
constexpr char16_t COMBINING_DOT_ABOVE = 0x0307;
constexpr char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3;
constexpr char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2;
constexpr char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3;
constexpr char16_t LINE_SEPARATOR = 0x2028;
constexpr char16_t PARA_SEPARATOR = 0x2029;
constexpr char16_t REPLACEMENT_CHARACTER = 0xFFFD;
     83 
     84 const char16_t LeadSurrogateMin = 0xD800;
     85 const char16_t LeadSurrogateMax = 0xDBFF;
     86 const char16_t TrailSurrogateMin = 0xDC00;
     87 const char16_t TrailSurrogateMax = 0xDFFF;
     88 
     89 const char32_t UTF16Max = 0xFFFF;
     90 const char32_t NonBMPMin = 0x10000;
     91 const char32_t NonBMPMax = 0x10FFFF;
     92 
     93 class CharacterInfo {
     94  /*
     95   * upperCase and lowerCase normally store the delta between two
     96   * letters. For example the lower case alpha (a) has the char code
     97   * 97, and the upper case alpha (A) has 65. So for "a" we would
     98   * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
     99   * because this char is already in lower case.
    100   * Well, not -32 exactly, but (2**16 - 32) to induce
    101   * unsigned overflow with identical mathematical behavior.
    102   * For upper case alpha, we would store 0 in upperCase and 32 in
    103   * lowerCase (65 + 32 = 97).
    104   *
    105   * We use deltas to reuse information for multiple characters. For
    106   * example the whole lower case latin alphabet fits into one entry,
    107   * because it's always a UnicodeLetter and upperCase contains
    108   * -32.
    109   */
    110 public:
    111  uint16_t upperCase;
    112  uint16_t lowerCase;
    113  uint8_t flags;
    114 
    115  inline bool isSpace() const { return flags & CharFlag::SPACE; }
    116 
    117  inline bool isUnicodeIDStart() const {
    118    return flags & CharFlag::UNICODE_ID_START;
    119  }
    120 
    121  inline bool isUnicodeIDContinue() const {
    122    // Also matches <ZWNJ> and <ZWJ>!
    123    return flags & CharFlag::UNICODE_ID_CONTINUE;
    124  }
    125 };
    126 
// Two-stage lookup tables read by CharInfo(): index1 is indexed by the high
// bits of a code unit, index2 by the value read from index1 combined with the
// low bits, and js_charinfo by the value read from index2.
extern const uint8_t index1[];
extern const uint8_t index2[];
extern const CharacterInfo js_charinfo[];

// Number of low bits of a code unit that CharInfo() uses to select an entry
// within an index2 block.
constexpr size_t CharInfoShift = 6;
    132 
    133 inline const CharacterInfo& CharInfo(char16_t code) {
    134  const size_t shift = CharInfoShift;
    135  size_t index = index1[code >> shift];
    136  index = index2[(index << shift) + (code & ((1 << shift) - 1))];
    137 
    138  return js_charinfo[index];
    139 }
    140 
    141 inline bool IsIdentifierStart(char16_t ch) {
    142  /*
    143   * ES2016 11.6 IdentifierStart
    144   *  $ (dollar sign)
    145   *  _ (underscore)
    146   *  or any character with the Unicode property «ID_Start».
    147   *
    148   * We use a lookup table for small and thus common characters for speed.
    149   */
    150 
    151  if (ch < 128) {
    152    return js_isidstart[ch];
    153  }
    154 
    155  return CharInfo(ch).isUnicodeIDStart();
    156 }
    157 
    158 inline bool IsIdentifierStartASCII(char ch) {
    159  MOZ_ASSERT(uint8_t(ch) < 128);
    160  return js_isidstart[uint8_t(ch)];
    161 }
    162 
    163 bool IsIdentifierStartNonBMP(char32_t codePoint);
    164 
    165 inline bool IsIdentifierStart(char32_t codePoint) {
    166  if (MOZ_UNLIKELY(codePoint > UTF16Max)) {
    167    return IsIdentifierStartNonBMP(codePoint);
    168  }
    169  return IsIdentifierStart(char16_t(codePoint));
    170 }
    171 
    172 inline bool IsIdentifierPart(char16_t ch) {
    173  /*
    174   * ES2016 11.6 IdentifierPart
    175   *  $ (dollar sign)
    176   *  _ (underscore)
    177   *  <ZWNJ>
    178   *  <ZWJ>
    179   *  or any character with the Unicode property «ID_Continue».
    180   *
    181   * We use a lookup table for small and thus common characters for speed.
    182   */
    183 
    184  if (ch < 128) {
    185    return js_isident[ch];
    186  }
    187 
    188  return CharInfo(ch).isUnicodeIDContinue();
    189 }
    190 
    191 inline bool IsIdentifierPartASCII(char ch) {
    192  MOZ_ASSERT(uint8_t(ch) < 128);
    193  return js_isident[uint8_t(ch)];
    194 }
    195 
    196 bool IsIdentifierPartNonBMP(char32_t codePoint);
    197 
    198 inline bool IsIdentifierPart(char32_t codePoint) {
    199  if (MOZ_UNLIKELY(codePoint > UTF16Max)) {
    200    return IsIdentifierPartNonBMP(codePoint);
    201  }
    202  return IsIdentifierPart(char16_t(codePoint));
    203 }
    204 
    205 inline bool IsUnicodeIDStart(char16_t ch) {
    206  return CharInfo(ch).isUnicodeIDStart();
    207 }
    208 
    209 bool IsUnicodeIDStartNonBMP(char32_t codePoint);
    210 
    211 inline bool IsUnicodeIDStart(char32_t codePoint) {
    212  if (MOZ_UNLIKELY(codePoint > UTF16Max)) {
    213    return IsIdentifierStartNonBMP(codePoint);
    214  }
    215  return IsUnicodeIDStart(char16_t(codePoint));
    216 }
    217 
    218 // IsSpace checks if a code point is included in the merged set of WhiteSpace
    219 // and LineTerminator specified by #sec-white-space and #sec-line-terminators.
    220 // We combine them because nearly every calling function wants this, excepting
    221 // only some tokenizer code that necessarily handles LineTerminator specially
    222 // due to UTF-8/UTF-16 template specialization.
    223 inline bool IsSpace(char16_t ch) {
    224  // ASCII code points are very common and must be handled quickly, so use a
    225  // lookup table for them.
    226  if (ch < 128) {
    227    return js_isspace[ch];
    228  }
    229 
    230  // NO-BREAK SPACE is supposed to be the most common non-ASCII WhiteSpace code
    231  // point, so inline its handling too.
    232  if (ch == NO_BREAK_SPACE) {
    233    return true;
    234  }
    235 
    236  return CharInfo(ch).isSpace();
    237 }
    238 
    239 inline bool IsSpace(JS::Latin1Char ch) {
    240  if (ch < 128) {
    241    return js_isspace[ch];
    242  }
    243 
    244  if (ch == NO_BREAK_SPACE) {
    245    return true;
    246  }
    247 
    248  MOZ_ASSERT(!CharInfo(ch).isSpace());
    249  return false;
    250 }
    251 
    252 inline bool IsSpace(char ch) {
    253  return IsSpace(static_cast<JS::Latin1Char>(ch));
    254 }
    255 
    256 // IsSpace(char32_t) must additionally exclude everything non-BMP.
    257 inline bool IsSpace(char32_t ch) {
    258  if (ch < 128) {
    259    return js_isspace[ch];
    260  }
    261 
    262  if (ch == NO_BREAK_SPACE) {
    263    return true;
    264  }
    265 
    266  // An assertion in make_unicode.py:make_unicode_file guarantees that there are
    267  // no Space_Separator (Zs) code points outside the BMP.
    268  if (ch >= NonBMPMin) {
    269    return false;
    270  }
    271 
    272  return CharInfo(mozilla::AssertedCast<char16_t>(ch)).isSpace();
    273 }
    274 
    275 /*
    276 * Returns the simple upper case mapping (possibly the identity mapping; see
    277 * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code
    278 * unit.
    279 */
    280 inline char16_t ToUpperCase(char16_t ch) {
    281  if (ch < 128) {
    282    if (ch >= 'a' && ch <= 'z') {
    283      return ch - ('a' - 'A');
    284    }
    285    return ch;
    286  }
    287 
    288  const CharacterInfo& info = CharInfo(ch);
    289 
    290  return uint16_t(ch) + info.upperCase;
    291 }
    292 
    293 /*
    294 * Returns the simple lower case mapping (possibly the identity mapping; see
    295 * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code
    296 * unit.
    297 */
    298 inline char16_t ToLowerCase(char16_t ch) {
    299  if (ch < 128) {
    300    if (ch >= 'A' && ch <= 'Z') {
    301      return ch + ('a' - 'A');
    302    }
    303    return ch;
    304  }
    305 
    306  const CharacterInfo& info = CharInfo(ch);
    307 
    308  return uint16_t(ch) + info.lowerCase;
    309 }
    310 
    311 extern const JS::Latin1Char latin1ToLowerCaseTable[];
    312 
    313 /*
    314 * Returns the simple lower case mapping (possibly the identity mapping; see
    315 * ChangesWhenUpperCasedSpecialCasing for details) of the given Latin-1 code
    316 * point.
    317 */
    318 inline JS::Latin1Char ToLowerCase(JS::Latin1Char ch) {
    319  return latin1ToLowerCaseTable[ch];
    320 }
    321 
    322 /*
    323 * Returns the simple lower case mapping (possibly the identity mapping; see
    324 * ChangesWhenUpperCasedSpecialCasing for details) of the given ASCII code
    325 * point.
    326 */
    327 inline char ToLowerCase(char ch) {
    328  MOZ_ASSERT(static_cast<unsigned char>(ch) < 128);
    329  return latin1ToLowerCaseTable[uint8_t(ch)];
    330 }
    331 
    332 /**
    333 * Returns true iff ToUpperCase(ch) != ch.
    334 *
    335 * This function isn't guaranteed to correctly handle code points for which
    336 * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the
    337 * same as the value of the Changes_When_Uppercased Unicode property value for
    338 * the code point.
    339 */
    340 inline bool ChangesWhenUpperCased(char16_t ch) {
    341  if (ch < 128) {
    342    return ch >= 'a' && ch <= 'z';
    343  }
    344  return CharInfo(ch).upperCase != 0;
    345 }
    346 
    347 /**
    348 * Returns true iff ToUpperCase(ch) != ch.
    349 *
    350 * This function isn't guaranteed to correctly handle code points for which
    351 * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the
    352 * same as the value of the Changes_When_Uppercased Unicode property value for
    353 * the code point.
    354 */
    355 inline bool ChangesWhenUpperCased(JS::Latin1Char ch) {
    356  if (MOZ_LIKELY(ch < 128)) {
    357    return ch >= 'a' && ch <= 'z';
    358  }
    359 
    360  // U+00B5 and U+00E0 to U+00FF, except U+00F7, have an uppercase form.
    361  bool hasUpper =
    362      ch == MICRO_SIGN || (((ch & ~0x1F) == LATIN_SMALL_LETTER_A_WITH_GRAVE) &&
    363                           ch != DIVISION_SIGN);
    364  MOZ_ASSERT(hasUpper == ChangesWhenUpperCased(char16_t(ch)));
    365  return hasUpper;
    366 }
    367 
    368 // Returns true iff ToLowerCase(ch) != ch.
    369 inline bool ChangesWhenLowerCased(char16_t ch) {
    370  if (ch < 128) {
    371    return ch >= 'A' && ch <= 'Z';
    372  }
    373  return CharInfo(ch).lowerCase != 0;
    374 }
    375 
    376 // Returns true iff ToLowerCase(ch) != ch.
    377 inline bool ChangesWhenLowerCased(JS::Latin1Char ch) {
    378  return latin1ToLowerCaseTable[ch] != ch;
    379 }
    380 
    381 #define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
    382  if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) return true;
    383 
    384 inline bool ChangesWhenUpperCasedNonBMP(char16_t lead, char16_t trail) {
    385  FOR_EACH_NON_BMP_UPPERCASE(CHECK_RANGE)
    386  return false;
    387 }
    388 
    389 inline bool ChangesWhenLowerCasedNonBMP(char16_t lead, char16_t trail) {
    390  FOR_EACH_NON_BMP_LOWERCASE(CHECK_RANGE)
    391  return false;
    392 }
    393 
    394 #undef CHECK_RANGE
    395 
    396 inline char16_t ToUpperCaseNonBMPTrail(char16_t lead, char16_t trail) {
    397 #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)  \
    398  if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \
    399    return trail + DIFF;
    400  FOR_EACH_NON_BMP_UPPERCASE(CALC_TRAIL)
    401 #undef CALL_TRAIL
    402 
    403  return trail;
    404 }
    405 
    406 inline char16_t ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail) {
    407 #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)  \
    408  if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \
    409    return trail + DIFF;
    410  FOR_EACH_NON_BMP_LOWERCASE(CALC_TRAIL)
    411 #undef CALL_TRAIL
    412 
    413  return trail;
    414 }
    415 
/*
 * Returns true if, independent of language/locale, the given UTF-16 code unit
 * has a special upper case mapping.
 *
 * Unicode defines two case mapping modes:
 *
 *   1. "simple case mappings" (defined in UnicodeData.txt) for one-to-one
 *      mappings that are always the same regardless of locale or context
 *      within a string (e.g. "a"→"A").
 *   2. "special case mappings" (defined in SpecialCasing.txt) for mappings
 *      that alter string length (e.g. uppercasing "ß"→"SS") or where different
 *      mappings occur depending on language/locale (e.g. uppercasing "i"→"I"
 *      usually but "i"→"İ" in Turkish) or context within the string (e.g.
 *      lowercasing "Σ" U+03A3 GREEK CAPITAL LETTER SIGMA to "ς" U+03C2 GREEK
 *      SMALL LETTER FINAL SIGMA when the sigma appears [roughly speaking] at
 *      the end of a word but "σ" U+03C3 GREEK SMALL LETTER SIGMA anywhere
 *      else).
 *
 * The ChangesWhenUpperCased*() functions defined above will return true for
 * code points that have simple case mappings, but they may not return the
 * right result for code points that have special case mappings.  To correctly
 * support full case mappings for all code points, callers must determine
 * whether this function returns true or false for the code point, then use
 * AppendUpperCaseSpecialCasing in the former case and ToUpperCase in the
 * latter.
 *
 * NOTE: All special upper case mappings are unconditional (that is, they don't
 *       depend on language/locale or context within the string) in Unicode 10.
 */
bool ChangesWhenUpperCasedSpecialCasing(char16_t ch);
    446 
/*
 * Returns the length of the upper case mapping of |ch| (presumably counted in
 * UTF-16 code units -- confirm against the definition).
 *
 * This function asserts if |ch| doesn't have a special upper case mapping,
 * so check ChangesWhenUpperCasedSpecialCasing(ch) first.
 */
size_t LengthUpperCaseSpecialCasing(char16_t ch);
    453 
/*
 * Appends the upper case mapping of |ch| to the given output buffer
 * |elements|, starting at the position stored in |*index|.
 *
 * NOTE(review): |*index| is presumably advanced past the appended code units
 * and the caller presumably must reserve LengthUpperCaseSpecialCasing(ch)
 * elements -- confirm against the definition.
 *
 * This function asserts if |ch| doesn't have a special upper case mapping.
 */
void AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements,
                                  size_t* index);
    462 
// Record type of the js_foldinfo table, as returned by CaseFoldInfo().
class FoldingInfo {
 public:
  // Delta that FoldCase() adds to a code unit; the sum is truncated to 16
  // bits when converted back to char16_t.
  uint16_t folding;
};
    467 
    468 extern const uint8_t folding_index1[];
    469 extern const uint8_t folding_index2[];
    470 extern const FoldingInfo js_foldinfo[];
    471 
    472 inline const FoldingInfo& CaseFoldInfo(char16_t code) {
    473  const size_t shift = 5;
    474  size_t index = folding_index1[code >> shift];
    475  index = folding_index2[(index << shift) + (code & ((1 << shift) - 1))];
    476  return js_foldinfo[index];
    477 }
    478 
    479 inline char16_t FoldCase(char16_t ch) {
    480  const FoldingInfo& info = CaseFoldInfo(ch);
    481  return uint16_t(ch) + info.folding;
    482 }
    483 
    484 inline bool IsSupplementary(char32_t codePoint) {
    485  return codePoint >= NonBMPMin && codePoint <= NonBMPMax;
    486 }
    487 
    488 inline bool IsLeadSurrogate(char32_t codePoint) {
    489  return codePoint >= LeadSurrogateMin && codePoint <= LeadSurrogateMax;
    490 }
    491 
    492 inline bool IsTrailSurrogate(char32_t codePoint) {
    493  return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax;
    494 }
    495 
    496 /**
    497 * True iff the given value is a UTF-16 surrogate.
    498 *
    499 * This function is intended for use in contexts where 32-bit values may need
    500 * to be tested to see if they reside in the surrogate range, so it doesn't
    501 * just take char16_t.
    502 */
    503 inline bool IsSurrogate(char32_t codePoint) {
    504  return LeadSurrogateMin <= codePoint && codePoint <= TrailSurrogateMax;
    505 }
    506 
    507 inline char16_t LeadSurrogate(char32_t codePoint) {
    508  MOZ_ASSERT(IsSupplementary(codePoint));
    509 
    510  return char16_t((codePoint >> 10) + (LeadSurrogateMin - (NonBMPMin >> 10)));
    511 }
    512 
    513 inline char16_t TrailSurrogate(char32_t codePoint) {
    514  MOZ_ASSERT(IsSupplementary(codePoint));
    515 
    516  return char16_t((codePoint & 0x3FF) | TrailSurrogateMin);
    517 }
    518 
    519 inline void UTF16Encode(char32_t codePoint, char16_t* lead, char16_t* trail) {
    520  MOZ_ASSERT(IsSupplementary(codePoint));
    521 
    522  *lead = LeadSurrogate(codePoint);
    523  *trail = TrailSurrogate(codePoint);
    524 }
    525 
    526 inline void UTF16Encode(char32_t codePoint, char16_t* elements,
    527                        unsigned* index) {
    528  if (!IsSupplementary(codePoint)) {
    529    elements[(*index)++] = char16_t(codePoint);
    530  } else {
    531    elements[(*index)++] = LeadSurrogate(codePoint);
    532    elements[(*index)++] = TrailSurrogate(codePoint);
    533  }
    534 }
    535 
    536 inline char32_t UTF16Decode(char16_t lead, char16_t trail) {
    537  MOZ_ASSERT(IsLeadSurrogate(lead));
    538  MOZ_ASSERT(IsTrailSurrogate(trail));
    539 
    540  return (lead << 10) + trail +
    541         (NonBMPMin - (LeadSurrogateMin << 10) - TrailSurrogateMin);
    542 }
    543 
    544 } /* namespace unicode */
    545 } /* namespace js */
    546 
    547 #endif /* util_Unicode_h */