nsUnicharUtils.cpp (18039B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "nsUnicharUtils.h" 7 #include "nsUnicodeProperties.h" 8 #include "nsUTF8Utils.h" 9 #include "mozilla/Likely.h" 10 #include "mozilla/HashFunctions.h" 11 #include "mozilla/intl/UnicodeProperties.h" 12 #include "mozilla/StaticPrefs_layout.h" 13 14 // We map x -> x, except for upper-case letters, 15 // which we map to their lower-case equivalents. 16 static const uint8_t gASCIIToLower[128] = { 17 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 18 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 19 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 20 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 21 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 22 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 23 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 24 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 25 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 26 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 27 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 28 }; 29 30 // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast 31 // when they're called from within the case-insensitive comparators, so we 32 // define inlined versions. 33 static MOZ_ALWAYS_INLINE uint32_t ToLowerCase_inline(uint32_t aChar) { 34 if (IS_ASCII(aChar)) { 35 return gASCIIToLower[aChar]; 36 } 37 38 return mozilla::intl::UnicodeProperties::ToLower(aChar); 39 } 40 41 static MOZ_ALWAYS_INLINE uint32_t 42 ToLowerCaseASCII_inline(const uint32_t aChar) { 43 if (IS_ASCII(aChar)) { 44 return gASCIIToLower[aChar]; 45 } 46 47 return aChar; 48 } 49 50 void ToLowerCase(nsAString& aString) { 51 char16_t* buf = aString.BeginWriting(); 52 ToLowerCase(buf, buf, aString.Length()); 53 } 54 55 void ToLowerCaseASCII(nsAString& aString) { 56 char16_t* buf = aString.BeginWriting(); 57 ToLowerCaseASCII(buf, buf, aString.Length()); 58 } 59 60 char ToLowerCaseASCII(char aChar) { 61 if (aChar >= 'A' && aChar <= 'Z') { 62 return aChar + 0x20; 63 } 64 return aChar; 65 } 66 67 char16_t ToLowerCaseASCII(char16_t aChar) { 68 if (aChar >= 'A' && aChar <= 'Z') { 69 return aChar + 0x20; 70 } 71 return aChar; 72 } 73 74 char32_t ToLowerCaseASCII(char32_t aChar) { 75 if (aChar >= 'A' && aChar <= 'Z') { 76 return aChar + 0x20; 77 } 78 return aChar; 79 } 80 81 char ToUpperCaseASCII(char aChar) { 82 if (aChar >= 'a' && aChar <= 'z') { 83 return aChar - 0x20; 84 } 85 return aChar; 86 } 87 88 char16_t ToUpperCaseASCII(char16_t aChar) { 89 if (aChar >= 'a' && aChar <= 'z') { 90 return aChar - 0x20; 91 } 92 return aChar; 93 } 94 95 char32_t ToUpperCaseASCII(char32_t aChar) { 96 if (aChar >= 'a' && aChar <= 'z') { 97 return aChar - 0x20; 98 } 99 return aChar; 100 } 101 102 void ToLowerCase(const nsAString& aSource, nsAString& aDest) { 103 const char16_t* in = aSource.BeginReading(); 104 size_t len = aSource.Length(); 105 106 aDest.SetLength(len); 107 char16_t* out = aDest.BeginWriting(); 108 109 ToLowerCase(in, out, len); 110 } 111 112 void ToLowerCaseASCII(const nsAString& aSource, nsAString& aDest) { 113 const char16_t* in = aSource.BeginReading(); 114 size_t len = aSource.Length(); 115 116 aDest.SetLength(len); 117 char16_t* out = aDest.BeginWriting(); 118 119 ToLowerCaseASCII(in, out, len); 120 } 121 122 uint32_t ToLowerCaseASCII(const uint32_t aChar) { 123 return ToLowerCaseASCII_inline(aChar); 124 } 125 126 void ToUpperCase(nsAString& aString) { 127 char16_t* buf = aString.BeginWriting(); 128 ToUpperCase(buf, buf, aString.Length()); 129 } 130 131 void ToUpperCase(const nsAString& aSource, nsAString& aDest) { 132 const char16_t* in = aSource.BeginReading(); 133 size_t len = aSource.Length(); 134 135 aDest.SetLength(len); 136 char16_t* out = aDest.BeginWriting(); 137 138 ToUpperCase(in, out, len); 139 } 140 141 #ifdef MOZILLA_INTERNAL_API 142 143 uint32_t ToFoldedCase(uint32_t aChar) { 144 if (IS_ASCII(aChar)) return gASCIIToLower[aChar]; 145 return mozilla::unicode::GetFoldedcase(aChar); 146 } 147 148 void ToFoldedCase(nsAString& aString) { 149 char16_t* buf = aString.BeginWriting(); 150 ToFoldedCase(buf, buf, aString.Length()); 151 } 152 153 void ToFoldedCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { 154 for (uint32_t i = 0; i < aLen; i++) { 155 uint32_t ch = aIn[i]; 156 if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { 157 ch = mozilla::unicode::GetFoldedcase(SURROGATE_TO_UCS4(ch, aIn[i + 1])); 158 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); 159 aOut[i++] = H_SURROGATE(ch); 160 aOut[i] = L_SURROGATE(ch); 161 continue; 162 } 163 aOut[i] = ToFoldedCase(ch); 164 } 165 } 166 167 uint32_t ToNaked(uint32_t aChar) { 168 if (IS_ASCII(aChar)) { 169 return aChar; 170 } 171 return mozilla::unicode::GetNaked(aChar); 172 } 173 174 void ToNaked(nsAString& aString) { 175 uint32_t i = 0; 176 while (i < aString.Length()) { 177 uint32_t ch = aString[i]; 178 if (i < aString.Length() - 1 && NS_IS_SURROGATE_PAIR(ch, aString[i + 1])) { 179 ch = SURROGATE_TO_UCS4(ch, aString[i + 1]); 180 if (mozilla::unicode::IsCombiningDiacritic(ch)) { 181 aString.Cut(i, 2); 182 } else { 183 ch = mozilla::unicode::GetNaked(ch); 184 NS_ASSERTION(!IS_IN_BMP(ch), "stripping crossed BMP/SMP boundary!"); 185 aString.Replace(i++, 1, H_SURROGATE(ch)); 186 aString.Replace(i++, 1, L_SURROGATE(ch)); 187 } 188 continue; 189 } 190 if (mozilla::unicode::IsCombiningDiacritic(ch)) { 191 aString.Cut(i, 1); 192 } else { 193 aString.Replace(i++, 1, ToNaked(ch)); 194 } 195 } 196 } 197 198 int32_t nsCaseInsensitiveStringComparator(const char16_t* lhs, 199 const char16_t* rhs, size_t lLength, 200 size_t rLength) { 201 return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) 202 : (lLength > rLength) ? 1 203 : -1; 204 } 205 206 int32_t nsCaseInsensitiveUTF8StringComparator(const char* lhs, const char* rhs, 207 size_t lLength, size_t rLength) { 208 return CaseInsensitiveCompare(lhs, rhs, lLength, rLength); 209 } 210 211 int32_t nsASCIICaseInsensitiveStringComparator(const char16_t* lhs, 212 const char16_t* rhs, 213 size_t lLength, size_t rLength) { 214 if (lLength != rLength) { 215 if (lLength > rLength) return 1; 216 return -1; 217 } 218 219 while (rLength) { 220 // we don't care about surrogates here, because we're only 221 // lowercasing the ASCII range 222 char16_t l = *lhs++; 223 char16_t r = *rhs++; 224 if (l != r) { 225 l = ToLowerCaseASCII_inline(l); 226 r = ToLowerCaseASCII_inline(r); 227 228 if (l > r) 229 return 1; 230 else if (r > l) 231 return -1; 232 } 233 rLength--; 234 } 235 236 return 0; 237 } 238 239 #endif // MOZILLA_INTERNAL_API 240 241 uint32_t ToLowerCase(uint32_t aChar) { return ToLowerCase_inline(aChar); } 242 243 void ToLowerCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { 244 for (size_t i = 0; i < aLen; i++) { 245 uint32_t ch = aIn[i]; 246 if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { 247 ch = mozilla::intl::UnicodeProperties::ToLower( 248 SURROGATE_TO_UCS4(ch, aIn[i + 1])); 249 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); 250 aOut[i++] = H_SURROGATE(ch); 251 aOut[i] = L_SURROGATE(ch); 252 continue; 253 } 254 aOut[i] = ToLowerCase(ch); 255 } 256 } 257 258 void ToLowerCaseASCII(const char16_t* aIn, char16_t* aOut, size_t aLen) { 259 for (size_t i = 0; i < aLen; i++) { 260 char16_t ch = aIn[i]; 261 aOut[i] = IS_ASCII_UPPER(ch) ? (ch + 0x20) : ch; 262 } 263 } 264 265 uint32_t ToUpperCase(uint32_t aChar) { 266 if (IS_ASCII(aChar)) { 267 if (IS_ASCII_LOWER(aChar)) { 268 return aChar - 0x20; 269 } 270 return aChar; 271 } 272 273 return mozilla::intl::UnicodeProperties::ToUpper(aChar); 274 } 275 276 void ToUpperCase(const char16_t* aIn, char16_t* aOut, size_t aLen) { 277 for (size_t i = 0; i < aLen; i++) { 278 uint32_t ch = aIn[i]; 279 if (i < aLen - 1 && NS_IS_SURROGATE_PAIR(ch, aIn[i + 1])) { 280 ch = mozilla::intl::UnicodeProperties::ToUpper( 281 SURROGATE_TO_UCS4(ch, aIn[i + 1])); 282 NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!"); 283 aOut[i++] = H_SURROGATE(ch); 284 aOut[i] = L_SURROGATE(ch); 285 continue; 286 } 287 aOut[i] = ToUpperCase(ch); 288 } 289 } 290 291 uint32_t ToTitleCase(uint32_t aChar) { 292 if (IS_ASCII(aChar)) { 293 return ToUpperCase(aChar); 294 } 295 296 return mozilla::unicode::GetTitlecaseForLower(aChar); 297 } 298 299 int32_t CaseInsensitiveCompare(const char16_t* a, const char16_t* b, 300 size_t len) { 301 NS_ASSERTION(a && b, "Do not pass in invalid pointers!"); 302 303 if (len) { 304 do { 305 uint32_t c1 = *a++; 306 uint32_t c2 = *b++; 307 308 // Unfortunately, we need to check for surrogates BEFORE we check 309 // for equality, because we could have identical high surrogates 310 // but non-identical characters, so we can't just skip them 311 312 // If c1 isn't a surrogate, we don't bother to check c2; 313 // in the case where it _is_ a surrogate, we're definitely going to get 314 // a mismatch, and don't need to interpret and lowercase it 315 316 if (len > 1 && NS_IS_SURROGATE_PAIR(c1, *a)) { 317 c1 = SURROGATE_TO_UCS4(c1, *a++); 318 if (NS_IS_SURROGATE_PAIR(c2, *b)) { 319 c2 = SURROGATE_TO_UCS4(c2, *b++); 320 } 321 // If c2 wasn't a surrogate, decrementing len means we'd stop 322 // short of the end of string b, but that doesn't actually matter 323 // because we're going to find a mismatch and return early 324 --len; 325 } 326 327 if (c1 != c2) { 328 c1 = ToLowerCase_inline(c1); 329 c2 = ToLowerCase_inline(c2); 330 if (c1 != c2) { 331 if (c1 < c2) { 332 return -1; 333 } 334 return 1; 335 } 336 } 337 } while (--len != 0); 338 } 339 return 0; 340 } 341 342 // Inlined definition of GetLowerUTF8Codepoint, which we use because we want 343 // to be fast when called from the case-insensitive comparators. 344 static MOZ_ALWAYS_INLINE uint32_t GetLowerUTF8Codepoint_inline( 345 const char* aStr, const char* aEnd, const char** aNext) { 346 // Convert to unsigned char so that stuffing chars into PRUint32s doesn't 347 // sign extend. 348 const unsigned char* str = (unsigned char*)aStr; 349 350 if (UTF8traits::isASCII(str[0])) { 351 // It's ASCII; just convert to lower-case and return it. 352 *aNext = aStr + 1; 353 return gASCIIToLower[*str]; 354 } 355 if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) { 356 // It's a two-byte sequence, so it looks like 357 // 110XXXXX 10XXXXXX. 358 // This is definitely in the BMP, so we can store straightaway into a 359 // uint16_t. 360 361 uint16_t c; 362 c = (str[0] & 0x1F) << 6; 363 c += (str[1] & 0x3F); 364 365 // we don't go through ToLowerCase here, because we know this isn't 366 // an ASCII character so the ASCII fast-path there is useless 367 c = mozilla::intl::UnicodeProperties::ToLower(c); 368 369 *aNext = aStr + 2; 370 return c; 371 } 372 if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) { 373 // It's a three-byte sequence, so it looks like 374 // 1110XXXX 10XXXXXX 10XXXXXX. 375 // This will just barely fit into 16-bits, so store into a uint16_t. 376 377 uint16_t c; 378 c = (str[0] & 0x0F) << 12; 379 c += (str[1] & 0x3F) << 6; 380 c += (str[2] & 0x3F); 381 382 c = mozilla::intl::UnicodeProperties::ToLower(c); 383 384 *aNext = aStr + 3; 385 return c; 386 } 387 if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) { 388 // It's a four-byte sequence, so it looks like 389 // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX. 390 391 uint32_t c; 392 c = (str[0] & 0x07) << 18; 393 c += (str[1] & 0x3F) << 12; 394 c += (str[2] & 0x3F) << 6; 395 c += (str[3] & 0x3F); 396 397 c = mozilla::intl::UnicodeProperties::ToLower(c); 398 399 *aNext = aStr + 4; 400 return c; 401 } 402 403 // Hm, we don't understand this sequence. 404 return -1; 405 } 406 407 uint32_t GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, 408 const char** aNext) { 409 return GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext); 410 } 411 412 int32_t CaseInsensitiveCompare(const char* aLeft, const char* aRight, 413 size_t aLeftBytes, size_t aRightBytes) { 414 const char* leftEnd = aLeft + aLeftBytes; 415 const char* rightEnd = aRight + aRightBytes; 416 417 while (aLeft < leftEnd && aRight < rightEnd) { 418 uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, leftEnd, &aLeft); 419 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) return -1; 420 421 uint32_t rightChar = 422 GetLowerUTF8Codepoint_inline(aRight, rightEnd, &aRight); 423 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) return -1; 424 425 // Now leftChar and rightChar are lower-case, so we can compare them. 426 if (leftChar != rightChar) { 427 if (leftChar > rightChar) return 1; 428 return -1; 429 } 430 } 431 432 // Make sure that if one string is longer than the other we return the 433 // correct result. 434 if (aLeft < leftEnd) return 1; 435 if (aRight < rightEnd) return -1; 436 437 return 0; 438 } 439 440 static MOZ_ALWAYS_INLINE uint32_t 441 GetLowerUTF8Codepoint_inline(const char* aStr, const char* aEnd, 442 const char** aNext, bool aMatchDiacritics) { 443 uint32_t c; 444 for (;;) { 445 c = GetLowerUTF8Codepoint_inline(aStr, aEnd, aNext); 446 if (aMatchDiacritics) { 447 break; 448 } 449 if (!mozilla::unicode::IsCombiningDiacritic(c)) { 450 break; 451 } 452 aStr = *aNext; 453 } 454 return c; 455 } 456 457 bool CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight, 458 const char* aLeftEnd, const char* aRightEnd, 459 const char** aLeftNext, 460 const char** aRightNext, bool* aErr, 461 bool aMatchDiacritics) { 462 NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null."); 463 NS_ASSERTION(aRightNext, "Out pointer shouldn't be null."); 464 NS_ASSERTION(aErr, "Out pointer shouldn't be null."); 465 NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd."); 466 NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd."); 467 468 uint32_t leftChar = GetLowerUTF8Codepoint_inline(aLeft, aLeftEnd, aLeftNext, 469 aMatchDiacritics); 470 if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) { 471 *aErr = true; 472 return false; 473 } 474 475 uint32_t rightChar = GetLowerUTF8Codepoint_inline( 476 aRight, aRightEnd, aRightNext, aMatchDiacritics); 477 if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) { 478 *aErr = true; 479 return false; 480 } 481 482 // Can't have an error past this point. 483 *aErr = false; 484 485 if (!aMatchDiacritics) { 486 leftChar = ToNaked(leftChar); 487 rightChar = ToNaked(rightChar); 488 } 489 490 return leftChar == rightChar; 491 } 492 493 namespace mozilla { 494 495 uint32_t HashUTF8AsUTF16(const char* aUTF8, size_t aLength, bool* aErr) { 496 uint32_t hash = 0; 497 const char* s = aUTF8; 498 const char* end = aUTF8 + aLength; 499 500 *aErr = false; 501 502 while (s < end) { 503 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr); 504 if (*aErr) { 505 return 0; 506 } 507 508 if (ucs4 < PLANE1_BASE) { 509 hash = AddToHash(hash, ucs4); 510 } else { 511 hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4)); 512 } 513 } 514 515 return hash; 516 } 517 518 // The Korean Won currency sign has East Asian Width = HALFWIDTH, and 519 // Script = COMMON (rather than HANGUL), but we don't want to treat it like 520 // Chinese/Japanese half-width characters for segment break transformation, 521 // so we exclude it individually in the two functions here. 522 static constexpr uint32_t kWonCurrencySign = 0x20A9; 523 524 bool IsSegmentBreakSkipChar(uint32_t u) { 525 return intl::UnicodeProperties::IsEastAsianWidthFHWexcludingEmoji(u) && 526 intl::UnicodeProperties::GetScriptCode(u) != intl::Script::HANGUL && 527 u != kWonCurrencySign; 528 } 529 530 bool IsEastAsianPunctuation(uint32_t u) { 531 // U+FF5E FULLWIDTH TILDE has General Category = Symbol (not Punctuation), 532 // but is used similarly to U+301C WAVE DASH (which does have category 533 // Punctuation). So we treat FULLWIDTH TILDE as punctuation here to give the 534 // two characters consistent behavior. 535 constexpr uint32_t kFullwidthTilde = 0xFF5E; 536 // U+3000 IDEOGRAPHIC SPACE has General Category = Zs (not Punctuation), 537 // but it conflicts with a JLReq rule that space added after 538 // question or exclamation mark is stipulated to be full-width if line is 539 // broken after full-width space following such a punctuation mark but 540 // line break is replaced by a space. So we treat IDEOGRAPHIC SPACE as 541 // punctuation here to allow line breaks after it while maintaining 542 // compatibility with JLReq. 543 constexpr uint32_t kIdeographicSpace = 0x3000; 544 return intl::UnicodeProperties::IsEastAsianWidthFHW(u) && 545 ((intl::UnicodeProperties::IsPunctuation(u) && 546 u != kWonCurrencySign) || 547 u == kFullwidthTilde || u == kIdeographicSpace); 548 } 549 550 bool IsPunctuationForWordSelect(char16_t aCh) { 551 const uint8_t cat = unicode::GetGeneralCategory(aCh); 552 switch (cat) { 553 case HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION: /* Pc */ 554 if (aCh == '_' && !StaticPrefs::layout_word_select_stop_at_underscore()) { 555 return false; 556 } 557 [[fallthrough]]; 558 case HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION: /* Pd */ 559 case HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION: /* Pe */ 560 case HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION: /* Pf */ 561 case HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION: /* Pi */ 562 case HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION: /* Po */ 563 case HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION: /* Ps */ 564 case HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL: /* Sc */ 565 // Deliberately omitted: 566 // case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL: /* Sk */ 567 case HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL: /* Sm */ 568 case HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL: /* So */ 569 return true; 570 default: 571 return false; 572 } 573 } 574 575 } // namespace mozilla