Unicode.h (16658B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef util_Unicode_h 8 #define util_Unicode_h 9 10 #include "mozilla/Casting.h" // mozilla::AssertedCast 11 12 #include "jspubtd.h" 13 14 #include "util/UnicodeNonBMP.h" 15 16 namespace js { 17 namespace unicode { 18 19 extern const bool js_isidstart[]; 20 extern const bool js_isident[]; 21 extern const bool js_isspace[]; 22 23 /* 24 * This namespace contains all the knowledge required to handle Unicode 25 * characters in JavaScript. 26 * 27 * SPACE 28 * Every character that is either in the ECMAScript class WhiteSpace 29 * (ES2016, § 11.2) or in LineTerminator (ES2016, § 11.3). 30 * 31 * WhiteSpace 32 * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF 33 * and every other Unicode character with the General Category "Zs". 34 * See <http://www.unicode.org/reports/tr44/#UnicodeData.txt> for more 35 * information about General Categories and the UnicodeData.txt file. 36 * 37 * LineTerminator 38 * \u000A, \u000D, \u2028, \u2029 39 * 40 * UNICODE_ID_START 41 * These are all characters with the Unicode property «ID_Start». 42 * 43 * UNICODE_ID_CONTINUE_ONLY 44 * These are all characters with the Unicode property «ID_Continue» minus all 45 * characters with the Unicode property «ID_Start». 46 * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6) 47 * 48 * UNICODE_ID_CONTINUE 49 * These are all characters with the Unicode property «ID_Continue». 50 * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6) 51 * 52 * Attention: UNICODE_ID_START is _not_ IdentifierStart, but you could build 53 * a matcher for the real IdentifierPart like this: 54 * 55 * if char in ['$', '_']: 56 * return True 57 * if GetFlag(char) & UNICODE_ID_CONTINUE: 58 * return True 59 * 60 */ 61 62 namespace CharFlag { 63 const uint8_t SPACE = 1 << 0; 64 const uint8_t UNICODE_ID_START = 1 << 1; 65 const uint8_t UNICODE_ID_CONTINUE_ONLY = 1 << 2; 66 const uint8_t UNICODE_ID_CONTINUE = UNICODE_ID_START + UNICODE_ID_CONTINUE_ONLY; 67 } // namespace CharFlag 68 69 constexpr char16_t NO_BREAK_SPACE = 0x00A0; 70 constexpr char16_t MICRO_SIGN = 0x00B5; 71 constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF; 72 constexpr char16_t LATIN_SMALL_LETTER_A_WITH_GRAVE = 0x00E0; 73 constexpr char16_t DIVISION_SIGN = 0x00F7; 74 constexpr char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF; 75 constexpr char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130; 76 constexpr char16_t COMBINING_DOT_ABOVE = 0x0307; 77 constexpr char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3; 78 constexpr char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2; 79 constexpr char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3; 80 constexpr char16_t LINE_SEPARATOR = 0x2028; 81 constexpr char16_t PARA_SEPARATOR = 0x2029; 82 constexpr char16_t REPLACEMENT_CHARACTER = 0xFFFD; 83 84 const char16_t LeadSurrogateMin = 0xD800; 85 const char16_t LeadSurrogateMax = 0xDBFF; 86 const char16_t TrailSurrogateMin = 0xDC00; 87 const char16_t TrailSurrogateMax = 0xDFFF; 88 89 const char32_t UTF16Max = 0xFFFF; 90 const char32_t NonBMPMin = 0x10000; 91 const char32_t NonBMPMax = 0x10FFFF; 92 93 class CharacterInfo { 94 /* 95 * upperCase and lowerCase normally store the delta between two 96 * letters. For example the lower case alpha (a) has the char code 97 * 97, and the upper case alpha (A) has 65. So for "a" we would 98 * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase, 99 * because this char is already in lower case. 100 * Well, not -32 exactly, but (2**16 - 32) to induce 101 * unsigned overflow with identical mathematical behavior. 102 * For upper case alpha, we would store 0 in upperCase and 32 in 103 * lowerCase (65 + 32 = 97). 104 * 105 * We use deltas to reuse information for multiple characters. For 106 * example the whole lower case latin alphabet fits into one entry, 107 * because it's always a UnicodeLetter and upperCase contains 108 * -32. 109 */ 110 public: 111 uint16_t upperCase; 112 uint16_t lowerCase; 113 uint8_t flags; 114 115 inline bool isSpace() const { return flags & CharFlag::SPACE; } 116 117 inline bool isUnicodeIDStart() const { 118 return flags & CharFlag::UNICODE_ID_START; 119 } 120 121 inline bool isUnicodeIDContinue() const { 122 // Also matches <ZWNJ> and <ZWJ>! 123 return flags & CharFlag::UNICODE_ID_CONTINUE; 124 } 125 }; 126 127 extern const uint8_t index1[]; 128 extern const uint8_t index2[]; 129 extern const CharacterInfo js_charinfo[]; 130 131 constexpr size_t CharInfoShift = 6; 132 133 inline const CharacterInfo& CharInfo(char16_t code) { 134 const size_t shift = CharInfoShift; 135 size_t index = index1[code >> shift]; 136 index = index2[(index << shift) + (code & ((1 << shift) - 1))]; 137 138 return js_charinfo[index]; 139 } 140 141 inline bool IsIdentifierStart(char16_t ch) { 142 /* 143 * ES2016 11.6 IdentifierStart 144 * $ (dollar sign) 145 * _ (underscore) 146 * or any character with the Unicode property «ID_Start». 147 * 148 * We use a lookup table for small and thus common characters for speed. 149 */ 150 151 if (ch < 128) { 152 return js_isidstart[ch]; 153 } 154 155 return CharInfo(ch).isUnicodeIDStart(); 156 } 157 158 inline bool IsIdentifierStartASCII(char ch) { 159 MOZ_ASSERT(uint8_t(ch) < 128); 160 return js_isidstart[uint8_t(ch)]; 161 } 162 163 bool IsIdentifierStartNonBMP(char32_t codePoint); 164 165 inline bool IsIdentifierStart(char32_t codePoint) { 166 if (MOZ_UNLIKELY(codePoint > UTF16Max)) { 167 return IsIdentifierStartNonBMP(codePoint); 168 } 169 return IsIdentifierStart(char16_t(codePoint)); 170 } 171 172 inline bool IsIdentifierPart(char16_t ch) { 173 /* 174 * ES2016 11.6 IdentifierPart 175 * $ (dollar sign) 176 * _ (underscore) 177 * <ZWNJ> 178 * <ZWJ> 179 * or any character with the Unicode property «ID_Continue». 180 * 181 * We use a lookup table for small and thus common characters for speed. 182 */ 183 184 if (ch < 128) { 185 return js_isident[ch]; 186 } 187 188 return CharInfo(ch).isUnicodeIDContinue(); 189 } 190 191 inline bool IsIdentifierPartASCII(char ch) { 192 MOZ_ASSERT(uint8_t(ch) < 128); 193 return js_isident[uint8_t(ch)]; 194 } 195 196 bool IsIdentifierPartNonBMP(char32_t codePoint); 197 198 inline bool IsIdentifierPart(char32_t codePoint) { 199 if (MOZ_UNLIKELY(codePoint > UTF16Max)) { 200 return IsIdentifierPartNonBMP(codePoint); 201 } 202 return IsIdentifierPart(char16_t(codePoint)); 203 } 204 205 inline bool IsUnicodeIDStart(char16_t ch) { 206 return CharInfo(ch).isUnicodeIDStart(); 207 } 208 209 bool IsUnicodeIDStartNonBMP(char32_t codePoint); 210 211 inline bool IsUnicodeIDStart(char32_t codePoint) { 212 if (MOZ_UNLIKELY(codePoint > UTF16Max)) { 213 return IsIdentifierStartNonBMP(codePoint); 214 } 215 return IsUnicodeIDStart(char16_t(codePoint)); 216 } 217 218 // IsSpace checks if a code point is included in the merged set of WhiteSpace 219 // and LineTerminator specified by #sec-white-space and #sec-line-terminators. 220 // We combine them because nearly every calling function wants this, excepting 221 // only some tokenizer code that necessarily handles LineTerminator specially 222 // due to UTF-8/UTF-16 template specialization. 223 inline bool IsSpace(char16_t ch) { 224 // ASCII code points are very common and must be handled quickly, so use a 225 // lookup table for them. 226 if (ch < 128) { 227 return js_isspace[ch]; 228 } 229 230 // NO-BREAK SPACE is supposed to be the most common non-ASCII WhiteSpace code 231 // point, so inline its handling too. 232 if (ch == NO_BREAK_SPACE) { 233 return true; 234 } 235 236 return CharInfo(ch).isSpace(); 237 } 238 239 inline bool IsSpace(JS::Latin1Char ch) { 240 if (ch < 128) { 241 return js_isspace[ch]; 242 } 243 244 if (ch == NO_BREAK_SPACE) { 245 return true; 246 } 247 248 MOZ_ASSERT(!CharInfo(ch).isSpace()); 249 return false; 250 } 251 252 inline bool IsSpace(char ch) { 253 return IsSpace(static_cast<JS::Latin1Char>(ch)); 254 } 255 256 // IsSpace(char32_t) must additionally exclude everything non-BMP. 257 inline bool IsSpace(char32_t ch) { 258 if (ch < 128) { 259 return js_isspace[ch]; 260 } 261 262 if (ch == NO_BREAK_SPACE) { 263 return true; 264 } 265 266 // An assertion in make_unicode.py:make_unicode_file guarantees that there are 267 // no Space_Separator (Zs) code points outside the BMP. 268 if (ch >= NonBMPMin) { 269 return false; 270 } 271 272 return CharInfo(mozilla::AssertedCast<char16_t>(ch)).isSpace(); 273 } 274 275 /* 276 * Returns the simple upper case mapping (possibly the identity mapping; see 277 * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code 278 * unit. 279 */ 280 inline char16_t ToUpperCase(char16_t ch) { 281 if (ch < 128) { 282 if (ch >= 'a' && ch <= 'z') { 283 return ch - ('a' - 'A'); 284 } 285 return ch; 286 } 287 288 const CharacterInfo& info = CharInfo(ch); 289 290 return uint16_t(ch) + info.upperCase; 291 } 292 293 /* 294 * Returns the simple lower case mapping (possibly the identity mapping; see 295 * ChangesWhenUpperCasedSpecialCasing for details) of the given UTF-16 code 296 * unit. 297 */ 298 inline char16_t ToLowerCase(char16_t ch) { 299 if (ch < 128) { 300 if (ch >= 'A' && ch <= 'Z') { 301 return ch + ('a' - 'A'); 302 } 303 return ch; 304 } 305 306 const CharacterInfo& info = CharInfo(ch); 307 308 return uint16_t(ch) + info.lowerCase; 309 } 310 311 extern const JS::Latin1Char latin1ToLowerCaseTable[]; 312 313 /* 314 * Returns the simple lower case mapping (possibly the identity mapping; see 315 * ChangesWhenUpperCasedSpecialCasing for details) of the given Latin-1 code 316 * point. 317 */ 318 inline JS::Latin1Char ToLowerCase(JS::Latin1Char ch) { 319 return latin1ToLowerCaseTable[ch]; 320 } 321 322 /* 323 * Returns the simple lower case mapping (possibly the identity mapping; see 324 * ChangesWhenUpperCasedSpecialCasing for details) of the given ASCII code 325 * point. 326 */ 327 inline char ToLowerCase(char ch) { 328 MOZ_ASSERT(static_cast<unsigned char>(ch) < 128); 329 return latin1ToLowerCaseTable[uint8_t(ch)]; 330 } 331 332 /** 333 * Returns true iff ToUpperCase(ch) != ch. 334 * 335 * This function isn't guaranteed to correctly handle code points for which 336 * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the 337 * same as the value of the Changes_When_Uppercased Unicode property value for 338 * the code point. 339 */ 340 inline bool ChangesWhenUpperCased(char16_t ch) { 341 if (ch < 128) { 342 return ch >= 'a' && ch <= 'z'; 343 } 344 return CharInfo(ch).upperCase != 0; 345 } 346 347 /** 348 * Returns true iff ToUpperCase(ch) != ch. 349 * 350 * This function isn't guaranteed to correctly handle code points for which 351 * |ChangesWhenUpperCasedSpecialCasing| returns true, so it is *not* always the 352 * same as the value of the Changes_When_Uppercased Unicode property value for 353 * the code point. 354 */ 355 inline bool ChangesWhenUpperCased(JS::Latin1Char ch) { 356 if (MOZ_LIKELY(ch < 128)) { 357 return ch >= 'a' && ch <= 'z'; 358 } 359 360 // U+00B5 and U+00E0 to U+00FF, except U+00F7, have an uppercase form. 361 bool hasUpper = 362 ch == MICRO_SIGN || (((ch & ~0x1F) == LATIN_SMALL_LETTER_A_WITH_GRAVE) && 363 ch != DIVISION_SIGN); 364 MOZ_ASSERT(hasUpper == ChangesWhenUpperCased(char16_t(ch))); 365 return hasUpper; 366 } 367 368 // Returns true iff ToLowerCase(ch) != ch. 369 inline bool ChangesWhenLowerCased(char16_t ch) { 370 if (ch < 128) { 371 return ch >= 'A' && ch <= 'Z'; 372 } 373 return CharInfo(ch).lowerCase != 0; 374 } 375 376 // Returns true iff ToLowerCase(ch) != ch. 377 inline bool ChangesWhenLowerCased(JS::Latin1Char ch) { 378 return latin1ToLowerCaseTable[ch] != ch; 379 } 380 381 #define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ 382 if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) return true; 383 384 inline bool ChangesWhenUpperCasedNonBMP(char16_t lead, char16_t trail) { 385 FOR_EACH_NON_BMP_UPPERCASE(CHECK_RANGE) 386 return false; 387 } 388 389 inline bool ChangesWhenLowerCasedNonBMP(char16_t lead, char16_t trail) { 390 FOR_EACH_NON_BMP_LOWERCASE(CHECK_RANGE) 391 return false; 392 } 393 394 #undef CHECK_RANGE 395 396 inline char16_t ToUpperCaseNonBMPTrail(char16_t lead, char16_t trail) { 397 #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ 398 if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \ 399 return trail + DIFF; 400 FOR_EACH_NON_BMP_UPPERCASE(CALC_TRAIL) 401 #undef CALL_TRAIL 402 403 return trail; 404 } 405 406 inline char16_t ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail) { 407 #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \ 408 if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \ 409 return trail + DIFF; 410 FOR_EACH_NON_BMP_LOWERCASE(CALC_TRAIL) 411 #undef CALL_TRAIL 412 413 return trail; 414 } 415 416 /* 417 * Returns true if, independent of language/locale, the given UTF-16 code unit 418 * has a special upper case mapping. 419 * 420 * Unicode defines two case mapping modes: 421 * 422 * 1. "simple case mappings" (defined in UnicodeData.txt) for one-to-one 423 * mappings that are always the same regardless of locale or context 424 * within a string (e.g. "a"→"A"). 425 * 2. "special case mappings" (defined in SpecialCasing.txt) for mappings 426 * that alter string length (e.g. uppercasing "ß"→"SS") or where different 427 * mappings occur depending on language/locale (e.g. uppercasing "i"→"I" 428 * usually but "i"→"İ" in Turkish) or context within the string (e.g. 429 * lowercasing "Σ" U+03A3 GREEK CAPITAL LETTER SIGMA to "ς" U+03C2 GREEK 430 * SMALL LETTER FINAL SIGMA when the sigma appears [roughly speaking] at 431 * the end of a word but "ς" U+03C3 GREEK SMALL LETTER SIGMA anywhere 432 * else). 433 * 434 * The ChangesWhenUpperCased*() functions defined above will return true for 435 * code points that have simple case mappings, but they may not return the 436 * right result for code points that have special case mappings. To correctly 437 * support full case mappings for all code points, callers must determine 438 * whether this function returns true or false for the code point, then use 439 * AppendUpperCaseSpecialCasing in the former case and ToUpperCase in the 440 * latter. 441 * 442 * NOTE: All special upper case mappings are unconditional (that is, they don't 443 * depend on language/locale or context within the string) in Unicode 10. 444 */ 445 bool ChangesWhenUpperCasedSpecialCasing(char16_t ch); 446 447 /* 448 * Returns the length of the upper case mapping of |ch|. 449 * 450 * This function asserts if |ch| doesn't have a special upper case mapping. 451 */ 452 size_t LengthUpperCaseSpecialCasing(char16_t ch); 453 454 /* 455 * Appends the upper case mapping of |ch| to the given output buffer, 456 * starting at the provided index. 457 * 458 * This function asserts if |ch| doesn't have a special upper case mapping. 459 */ 460 void AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, 461 size_t* index); 462 463 class FoldingInfo { 464 public: 465 uint16_t folding; 466 }; 467 468 extern const uint8_t folding_index1[]; 469 extern const uint8_t folding_index2[]; 470 extern const FoldingInfo js_foldinfo[]; 471 472 inline const FoldingInfo& CaseFoldInfo(char16_t code) { 473 const size_t shift = 5; 474 size_t index = folding_index1[code >> shift]; 475 index = folding_index2[(index << shift) + (code & ((1 << shift) - 1))]; 476 return js_foldinfo[index]; 477 } 478 479 inline char16_t FoldCase(char16_t ch) { 480 const FoldingInfo& info = CaseFoldInfo(ch); 481 return uint16_t(ch) + info.folding; 482 } 483 484 inline bool IsSupplementary(char32_t codePoint) { 485 return codePoint >= NonBMPMin && codePoint <= NonBMPMax; 486 } 487 488 inline bool IsLeadSurrogate(char32_t codePoint) { 489 return codePoint >= LeadSurrogateMin && codePoint <= LeadSurrogateMax; 490 } 491 492 inline bool IsTrailSurrogate(char32_t codePoint) { 493 return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax; 494 } 495 496 /** 497 * True iff the given value is a UTF-16 surrogate. 498 * 499 * This function is intended for use in contexts where 32-bit values may need 500 * to be tested to see if they reside in the surrogate range, so it doesn't 501 * just take char16_t. 502 */ 503 inline bool IsSurrogate(char32_t codePoint) { 504 return LeadSurrogateMin <= codePoint && codePoint <= TrailSurrogateMax; 505 } 506 507 inline char16_t LeadSurrogate(char32_t codePoint) { 508 MOZ_ASSERT(IsSupplementary(codePoint)); 509 510 return char16_t((codePoint >> 10) + (LeadSurrogateMin - (NonBMPMin >> 10))); 511 } 512 513 inline char16_t TrailSurrogate(char32_t codePoint) { 514 MOZ_ASSERT(IsSupplementary(codePoint)); 515 516 return char16_t((codePoint & 0x3FF) | TrailSurrogateMin); 517 } 518 519 inline void UTF16Encode(char32_t codePoint, char16_t* lead, char16_t* trail) { 520 MOZ_ASSERT(IsSupplementary(codePoint)); 521 522 *lead = LeadSurrogate(codePoint); 523 *trail = TrailSurrogate(codePoint); 524 } 525 526 inline void UTF16Encode(char32_t codePoint, char16_t* elements, 527 unsigned* index) { 528 if (!IsSupplementary(codePoint)) { 529 elements[(*index)++] = char16_t(codePoint); 530 } else { 531 elements[(*index)++] = LeadSurrogate(codePoint); 532 elements[(*index)++] = TrailSurrogate(codePoint); 533 } 534 } 535 536 inline char32_t UTF16Decode(char16_t lead, char16_t trail) { 537 MOZ_ASSERT(IsLeadSurrogate(lead)); 538 MOZ_ASSERT(IsTrailSurrogate(trail)); 539 540 return (lead << 10) + trail + 541 (NonBMPMin - (LeadSurrogateMin << 10) - TrailSurrogateMin); 542 } 543 544 } /* namespace unicode */ 545 } /* namespace js */ 546 547 #endif /* util_Unicode_h */