Locale.cpp (46459B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 #include "mozilla/intl/Locale.h" 6 7 #include "mozilla/Assertions.h" 8 #include "mozilla/Span.h" 9 #include "mozilla/TextUtils.h" 10 #include "mozilla/Variant.h" 11 12 #include "ICU4CGlue.h" 13 14 #include <algorithm> 15 #include <iterator> 16 #include <stddef.h> 17 #include <stdint.h> 18 #include <string.h> 19 #include <utility> 20 21 #include "unicode/uloc.h" 22 #include "unicode/utypes.h" 23 24 namespace mozilla::intl { 25 26 using namespace intl::LanguageTagLimits; 27 28 template <typename CharT> 29 bool IsStructurallyValidLanguageTag(Span<const CharT> aLanguage) { 30 // unicode_language_subtag = alpha{2,3} | alpha{5,8}; 31 size_t length = aLanguage.size(); 32 const CharT* str = aLanguage.data(); 33 return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) && 34 std::all_of(str, str + length, IsAsciiAlpha<CharT>); 35 } 36 37 template bool IsStructurallyValidLanguageTag(Span<const char> aLanguage); 38 template bool IsStructurallyValidLanguageTag(Span<const Latin1Char> aLanguage); 39 template bool IsStructurallyValidLanguageTag(Span<const char16_t> aLanguage); 40 41 template <typename CharT> 42 bool IsStructurallyValidScriptTag(Span<const CharT> aScript) { 43 // unicode_script_subtag = alpha{4} ; 44 size_t length = aScript.size(); 45 const CharT* str = aScript.data(); 46 return length == 4 && std::all_of(str, str + length, IsAsciiAlpha<CharT>); 47 } 48 49 template bool IsStructurallyValidScriptTag(Span<const char> aScript); 50 template bool IsStructurallyValidScriptTag(Span<const Latin1Char> aScript); 51 template bool IsStructurallyValidScriptTag(Span<const char16_t> aScript); 52 53 template <typename CharT> 54 bool IsStructurallyValidRegionTag(Span<const CharT> aRegion) { 55 // unicode_region_subtag = (alpha{2} | digit{3}) ; 56 size_t length = aRegion.size(); 57 const CharT* str = aRegion.data(); 58 return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha<CharT>)) || 59 (length == 3 && std::all_of(str, str + length, IsAsciiDigit<CharT>)); 60 } 61 62 template bool IsStructurallyValidRegionTag(Span<const char> aRegion); 63 template bool IsStructurallyValidRegionTag(Span<const Latin1Char> aRegion); 64 template bool IsStructurallyValidRegionTag(Span<const char16_t> aRegion); 65 66 template <typename CharT> 67 bool IsStructurallyValidVariantTag(Span<const CharT> aVariant) { 68 // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; 69 size_t length = aVariant.size(); 70 const CharT* str = aVariant.data(); 71 return ((5 <= length && length <= 8) || 72 (length == 4 && IsAsciiDigit(str[0]))) && 73 std::all_of(str, str + length, IsAsciiAlphanumeric<CharT>); 74 } 75 76 template bool IsStructurallyValidVariantTag(Span<const char> aVariant); 77 template bool IsStructurallyValidVariantTag(Span<const Latin1Char> aVariant); 78 template bool IsStructurallyValidVariantTag(Span<const char16_t> aVariant); 79 80 #ifdef DEBUG 81 bool IsStructurallyValidUnicodeExtensionTag(Span<const char> aExtension) { 82 return LocaleParser::CanParseUnicodeExtension(aExtension).isOk(); 83 } 84 85 static bool IsStructurallyValidExtensionTag(Span<const char> aExtension) { 86 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; 87 // NB: Allow any extension, including Unicode and Transform here, because 88 // this function is only used for an assertion. 89 90 size_t length = aExtension.size(); 91 const char* str = aExtension.data(); 92 const char* const end = aExtension.data() + length; 93 if (length <= 2) { 94 return false; 95 } 96 if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') { 97 return false; 98 } 99 str++; 100 if (*str++ != '-') { 101 return false; 102 } 103 while (true) { 104 const char* sep = 105 reinterpret_cast<const char*>(memchr(str, '-', end - str)); 106 size_t len = (sep ? sep : end) - str; 107 if (len < 2 || len > 8 || 108 !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { 109 return false; 110 } 111 if (!sep) { 112 return true; 113 } 114 str = sep + 1; 115 } 116 } 117 118 bool IsStructurallyValidPrivateUseTag(Span<const char> aPrivateUse) { 119 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; 120 121 size_t length = aPrivateUse.size(); 122 const char* str = aPrivateUse.data(); 123 const char* const end = aPrivateUse.data() + length; 124 if (length <= 2) { 125 return false; 126 } 127 if (str[0] != 'x' && str[0] != 'X') { 128 return false; 129 } 130 str++; 131 if (*str++ != '-') { 132 return false; 133 } 134 while (true) { 135 const char* sep = 136 reinterpret_cast<const char*>(memchr(str, '-', end - str)); 137 size_t len = (sep ? sep : end) - str; 138 if (len == 0 || len > 8 || 139 !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) { 140 return false; 141 } 142 if (!sep) { 143 return true; 144 } 145 str = sep + 1; 146 } 147 } 148 #endif 149 150 ptrdiff_t Locale::UnicodeExtensionIndex() const { 151 // The extension subtags aren't necessarily sorted, so we can't use binary 152 // search here. 153 auto p = std::find_if( 154 mExtensions.begin(), mExtensions.end(), 155 [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; }); 156 if (p != mExtensions.end()) { 157 return std::distance(mExtensions.begin(), p); 158 } 159 return -1; 160 } 161 162 Maybe<Span<const char>> Locale::GetUnicodeExtension() const { 163 ptrdiff_t index = UnicodeExtensionIndex(); 164 if (index >= 0) { 165 return Some(MakeStringSpan(mExtensions[index].get())); 166 } 167 return Nothing(); 168 } 169 170 ICUResult Locale::SetUnicodeExtension(Span<const char> aExtension) { 171 MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension)); 172 173 auto duplicated = DuplicateStringToUniqueChars(aExtension); 174 175 // Replace the existing Unicode extension subtag or append a new one. 176 ptrdiff_t index = UnicodeExtensionIndex(); 177 if (index >= 0) { 178 mExtensions[index] = std::move(duplicated); 179 return Ok(); 180 } 181 if (!mExtensions.append(std::move(duplicated))) { 182 return Err(ICUError::OutOfMemory); 183 } 184 return Ok(); 185 } 186 187 void Locale::ClearUnicodeExtension() { 188 ptrdiff_t index = UnicodeExtensionIndex(); 189 if (index >= 0) { 190 mExtensions.erase(mExtensions.begin() + index); 191 } 192 } 193 194 template <size_t InitialCapacity> 195 static void SortAlphabetically( 196 Vector<VariantSubtag, InitialCapacity>& aVariants) { 197 size_t length = aVariants.length(); 198 199 // Zero or one element lists are already sorted. 200 if (length < 2) { 201 return; 202 } 203 204 // Handle two element lists inline. 205 if (length == 2) { 206 if (aVariants[0].Span() > aVariants[1].Span()) { 207 std::swap(aVariants[0], aVariants[1]); 208 } 209 return; 210 } 211 212 std::stable_sort( 213 aVariants.begin(), aVariants.end(), 214 [](const auto& a, const auto& b) { return a.Span() < b.Span(); }); 215 } 216 217 template <size_t InitialCapacity> 218 static bool SortAlphabetically(Vector<UniqueChars, InitialCapacity>& aSubtags) { 219 size_t length = aSubtags.length(); 220 221 // Zero or one element lists are already sorted. 222 if (length < 2) { 223 return true; 224 } 225 226 // Handle two element lists inline. 227 if (length == 2) { 228 if (strcmp(aSubtags[0].get(), aSubtags[1].get()) > 0) { 229 aSubtags[0].swap(aSubtags[1]); 230 } 231 return true; 232 } 233 234 Vector<char*, 8> scratch; 235 if (!scratch.resizeUninitialized(length)) { 236 return false; 237 } 238 for (size_t i = 0; i < length; i++) { 239 scratch[i] = aSubtags[i].release(); 240 } 241 242 std::stable_sort( 243 scratch.begin(), scratch.end(), 244 [](const char* a, const char* b) { return strcmp(a, b) < 0; }); 245 246 for (size_t i = 0; i < length; i++) { 247 aSubtags[i] = UniqueChars(scratch[i]); 248 } 249 return true; 250 } 251 252 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeBaseName() { 253 // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to 254 // canonicalize the syntax by normalizing the case and ordering all subtags. 255 // The canonical syntax form is specified in UTS 35, 3.2.1. 256 257 // Language codes need to be in lower case. "JA" -> "ja" 258 mLanguage.ToLowerCase(); 259 MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span())); 260 261 // The first character of a script code needs to be capitalized. 262 // "hans" -> "Hans" 263 mScript.ToTitleCase(); 264 MOZ_ASSERT(Script().Missing() || 265 IsStructurallyValidScriptTag(Script().Span())); 266 267 // Region codes need to be in upper case. "bu" -> "BU" 268 mRegion.ToUpperCase(); 269 MOZ_ASSERT(Region().Missing() || 270 IsStructurallyValidRegionTag(Region().Span())); 271 272 // The canonical case for variant subtags is lowercase. 273 for (auto& variant : mVariants) { 274 variant.ToLowerCase(); 275 MOZ_ASSERT(IsStructurallyValidVariantTag(variant.Span())); 276 } 277 278 // Extensions and privateuse subtags are case normalized in the 279 // |canonicalizeExtensions| method. 280 281 // The second step in UTS 35, 3.2.1, is to order all subtags. 282 283 if (mVariants.length() > 1) { 284 // 1. Any variants are in alphabetical order. 285 SortAlphabetically(mVariants); 286 287 // Reject the Locale identifier if a duplicate variant was found, e.g. 288 // "en-variant-Variant". 289 const auto* duplicate = std::adjacent_find( 290 mVariants.begin(), mVariants.end(), 291 [](const auto& a, const auto& b) { return a.Span() == b.Span(); }); 292 if (duplicate != mVariants.end()) { 293 return Err(CanonicalizationError::DuplicateVariant); 294 } 295 } 296 297 // 2. Any extensions are in alphabetical order by their singleton. 298 // 3. All attributes are sorted in alphabetical order. 299 // 4. All keywords and tfields are sorted by alphabetical order of their keys, 300 // within their respective extensions. 301 // 5. Any type or tfield value "true" is removed. 302 // - A subsequent call to canonicalizeExtensions() will perform these steps. 303 304 // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier 305 // into its canonical form per UTS 3.2.1. 306 307 // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their 308 // canonical forms. 309 // - A subsequent call to canonicalizeExtensions() will perform this step. 310 311 // 2. Replace aliases in the unicode_language_id and tlang (if any). 312 // - tlang is handled in canonicalizeExtensions(). 313 314 // Replace deprecated language, region, and variant subtags with their 315 // preferred mappings. 316 317 if (!UpdateLegacyMappings()) { 318 return Err(CanonicalizationError::OutOfMemory); 319 } 320 321 // Replace deprecated language subtags with their preferred values. 322 if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) { 323 PerformComplexLanguageMappings(); 324 } 325 326 // Replace deprecated script subtags with their preferred values. 327 if (Script().Present()) { 328 ScriptMapping(mScript); 329 } 330 331 // Replace deprecated region subtags with their preferred values. 332 if (Region().Present()) { 333 if (!RegionMapping(mRegion) && ComplexRegionMapping(mRegion)) { 334 PerformComplexRegionMappings(); 335 } 336 } 337 338 // Replace deprecated variant subtags with their preferred values. 339 if (!PerformVariantMappings()) { 340 return Err(CanonicalizationError::OutOfMemory); 341 } 342 343 // No extension replacements are currently present. 344 // Private use sequences are left as is. 345 346 // 3. Replace aliases in special key values. 347 // - A subsequent call to canonicalizeExtensions() will perform this step. 348 349 return Ok(); 350 } 351 352 #ifdef DEBUG 353 static bool IsAsciiLowercaseAlphanumericOrDash(Span<const char> aSpan) { 354 const char* ptr = aSpan.data(); 355 size_t length = aSpan.size(); 356 return std::all_of(ptr, ptr + length, [](auto c) { 357 return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-'; 358 }); 359 } 360 #endif 361 362 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeExtensions() { 363 // The canonical case for all extension subtags is lowercase. 364 for (UniqueChars& extension : mExtensions) { 365 char* extensionChars = extension.get(); 366 size_t extensionLength = strlen(extensionChars); 367 AsciiToLowerCase(extensionChars, extensionLength, extensionChars); 368 369 MOZ_ASSERT( 370 IsStructurallyValidExtensionTag({extensionChars, extensionLength})); 371 } 372 373 // Any extensions are in alphabetical order by their singleton. 374 // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese" 375 if (!SortAlphabetically(mExtensions)) { 376 return Err(CanonicalizationError::OutOfMemory); 377 } 378 379 for (UniqueChars& extension : mExtensions) { 380 if (extension[0] == 'u') { 381 MOZ_TRY(CanonicalizeUnicodeExtension(extension)); 382 } else if (extension[0] == 't') { 383 MOZ_TRY(CanonicalizeTransformExtension(extension)); 384 } 385 386 MOZ_ASSERT( 387 IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get()))); 388 } 389 390 // The canonical case for privateuse subtags is lowercase. 391 if (char* privateuse = mPrivateUse.get()) { 392 size_t privateuseLength = strlen(privateuse); 393 AsciiToLowerCase(privateuse, privateuseLength, privateuse); 394 395 MOZ_ASSERT( 396 IsStructurallyValidPrivateUseTag({privateuse, privateuseLength})); 397 } 398 return Ok(); 399 } 400 401 template <size_t N> 402 static inline bool AppendSpan(Vector<char, N>& vector, Span<const char> aSpan) { 403 return vector.append(aSpan.data(), aSpan.size()); 404 } 405 406 /** 407 * CanonicalizeUnicodeExtension( attributes, keywords ) 408 * 409 * Canonical syntax per 410 * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: 411 * 412 * - All attributes and keywords are in lowercase. 413 * - Note: The parser already converted keywords to lowercase. 414 * - All attributes are sorted in alphabetical order. 415 * - All keywords are sorted by alphabetical order of their keys. 416 * - Any type value "true" is removed. 417 * 418 * Canonical form: 419 * - All keys and types use the canonical form (from the name attribute; 420 * see Section 3.6.4 U Extension Data Files). 421 */ 422 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeUnicodeExtension( 423 UniqueChars& aUnicodeExtension) { 424 Span<const char> extension = MakeStringSpan(aUnicodeExtension.get()); 425 MOZ_ASSERT(extension[0] == 'u'); 426 MOZ_ASSERT(extension[1] == '-'); 427 MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); 428 429 LocaleParser::AttributesVector attributes; 430 LocaleParser::KeywordsVector keywords; 431 432 using Attribute = LocaleParser::AttributesVector::ElementType; 433 using Keyword = LocaleParser::KeywordsVector::ElementType; 434 435 if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords) 436 .isErr()) { 437 MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag"); 438 return Err(CanonicalizationError::InternalError); 439 } 440 441 auto attributesLess = [extension](const Attribute& a, const Attribute& b) { 442 auto astr = extension.Subspan(a.Begin(), a.Length()); 443 auto bstr = extension.Subspan(b.Begin(), b.Length()); 444 return astr < bstr; 445 }; 446 447 // All attributes are sorted in alphabetical order. 448 if (attributes.length() > 1) { 449 std::stable_sort(attributes.begin(), attributes.end(), attributesLess); 450 } 451 452 auto keywordsLess = [extension](const Keyword& a, const Keyword& b) { 453 auto astr = extension.Subspan(a.Begin(), UnicodeKeyLength); 454 auto bstr = extension.Subspan(b.Begin(), UnicodeKeyLength); 455 return astr < bstr; 456 }; 457 458 // All keywords are sorted by alphabetical order of keys. 459 if (keywords.length() > 1) { 460 // Using a stable sort algorithm, guarantees that two keywords using the 461 // same key are never reordered. That means for example 462 // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to 463 // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs 464 // before "nu-latn". 465 // This is required so that deduplication below preserves the first keyword 466 // for a given key and discards the rest. 467 std::stable_sort(keywords.begin(), keywords.end(), keywordsLess); 468 } 469 470 Vector<char, 32> sb; 471 if (!sb.append('u')) { 472 return Err(CanonicalizationError::OutOfMemory); 473 } 474 475 // Append all Unicode extension attributes. 476 for (size_t i = 0; i < attributes.length(); i++) { 477 const auto& attribute = attributes[i]; 478 auto span = extension.Subspan(attribute.Begin(), attribute.Length()); 479 480 // Skip duplicate attributes. 481 if (i > 0) { 482 const auto& lastAttribute = attributes[i - 1]; 483 if (span == 484 extension.Subspan(lastAttribute.Begin(), lastAttribute.Length())) { 485 continue; 486 } 487 MOZ_ASSERT(attributesLess(lastAttribute, attribute)); 488 } 489 490 if (!sb.append('-')) { 491 return Err(CanonicalizationError::OutOfMemory); 492 } 493 if (!AppendSpan(sb, span)) { 494 return Err(CanonicalizationError::OutOfMemory); 495 } 496 } 497 498 static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1; 499 500 using StringSpan = Span<const char>; 501 502 static constexpr StringSpan True = MakeStringSpan("true"); 503 504 // Append all Unicode extension keywords. 505 for (size_t i = 0; i < keywords.length(); i++) { 506 const auto& keyword = keywords[i]; 507 508 // Skip duplicate keywords. 509 if (i > 0) { 510 const auto& lastKeyword = keywords[i - 1]; 511 if (extension.Subspan(keyword.Begin(), UnicodeKeyLength) == 512 extension.Subspan(lastKeyword.Begin(), UnicodeKeyLength)) { 513 continue; 514 } 515 MOZ_ASSERT(keywordsLess(lastKeyword, keyword)); 516 } 517 518 if (!sb.append('-')) { 519 return Err(CanonicalizationError::OutOfMemory); 520 } 521 522 StringSpan span = extension.Subspan(keyword.Begin(), keyword.Length()); 523 if (span.size() == UnicodeKeyLength) { 524 // Keyword without type value. 525 if (!AppendSpan(sb, span)) { 526 return Err(CanonicalizationError::OutOfMemory); 527 } 528 } else { 529 StringSpan key = span.To(UnicodeKeyLength); 530 StringSpan type = span.From(UnicodeKeyWithSepLength); 531 532 // Search if there's a replacement for the current Unicode keyword. 533 if (const char* replacement = ReplaceUnicodeExtensionType(key, type)) { 534 StringSpan repl = MakeStringSpan(replacement); 535 if (repl == True) { 536 // Elide the type "true" if present in the replacement. 537 if (!AppendSpan(sb, key)) { 538 return Err(CanonicalizationError::OutOfMemory); 539 } 540 } else { 541 // Otherwise append the Unicode key (including the separator) and the 542 // replaced type. 543 if (!AppendSpan(sb, span.To(UnicodeKeyWithSepLength))) { 544 return Err(CanonicalizationError::OutOfMemory); 545 } 546 if (!AppendSpan(sb, repl)) { 547 return Err(CanonicalizationError::OutOfMemory); 548 } 549 } 550 } else { 551 if (type == True) { 552 // Elide the Unicode extension type "true". 553 if (!AppendSpan(sb, key)) { 554 return Err(CanonicalizationError::OutOfMemory); 555 } 556 } else { 557 // Otherwise append the complete Unicode extension keyword. 558 if (!AppendSpan(sb, span)) { 559 return Err(CanonicalizationError::OutOfMemory); 560 } 561 } 562 } 563 } 564 } 565 566 // We can keep the previous extension when canonicalization didn't modify it. 567 if (static_cast<Span<const char>>(sb) != extension) { 568 // Otherwise replace the previous extension with the canonical extension. 569 UniqueChars canonical = DuplicateStringToUniqueChars(sb); 570 if (!canonical) { 571 return Err(CanonicalizationError::OutOfMemory); 572 } 573 aUnicodeExtension = std::move(canonical); 574 } 575 576 return Ok(); 577 } 578 579 template <class Buffer> 580 static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) { 581 auto appendSubtag = [&aBuffer](const auto& subtag) { 582 auto span = subtag.Span(); 583 MOZ_ASSERT(!span.empty()); 584 return aBuffer.append(span.data(), span.size()); 585 }; 586 587 auto appendSubtagSpan = [&aBuffer](Span<const char> subtag) { 588 MOZ_ASSERT(!subtag.empty()); 589 return aBuffer.append(subtag.data(), subtag.size()); 590 }; 591 592 auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) { 593 for (const auto& subtag : subtags) { 594 if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) { 595 return false; 596 } 597 } 598 return true; 599 }; 600 601 // Append the language subtag. 602 if (!appendSubtag(aTag.Language())) { 603 return false; 604 } 605 606 // Append the script subtag if present. 607 if (aTag.Script().Present()) { 608 if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) { 609 return false; 610 } 611 } 612 613 // Append the region subtag if present. 614 if (aTag.Region().Present()) { 615 if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) { 616 return false; 617 } 618 } 619 620 // Append the variant subtags if present. 621 if (!appendSubtags(aTag.Variants())) { 622 return false; 623 } 624 625 // Append the extensions subtags if present. 626 if (!appendSubtags(aTag.Extensions())) { 627 return false; 628 } 629 630 // Append the private-use subtag if present. 631 if (auto privateuse = aTag.PrivateUse()) { 632 if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) { 633 return false; 634 } 635 } 636 637 return true; 638 } 639 640 /** 641 * CanonicalizeTransformExtension 642 * 643 * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>: 644 * 645 * - These subtags are all in lowercase (that is the canonical casing for these 646 * subtags), [...]. 647 * 648 * And per 649 * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>: 650 * 651 * - All keywords and tfields are sorted by alphabetical order of their keys, 652 * within their respective extensions. 653 */ 654 Result<Ok, Locale::CanonicalizationError> 655 Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) { 656 Span<const char> extension = MakeStringSpan(aTransformExtension.get()); 657 MOZ_ASSERT(extension[0] == 't'); 658 MOZ_ASSERT(extension[1] == '-'); 659 MOZ_ASSERT(IsStructurallyValidExtensionTag(extension)); 660 661 Locale tag; 662 LocaleParser::TFieldVector fields; 663 664 using TField = LocaleParser::TFieldVector::ElementType; 665 666 if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) { 667 MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag"); 668 return Err(CanonicalizationError::InternalError); 669 } 670 671 auto tfieldLess = [extension](const TField& a, const TField& b) { 672 auto astr = extension.Subspan(a.Begin(), TransformKeyLength); 673 auto bstr = extension.Subspan(b.Begin(), TransformKeyLength); 674 return astr < bstr; 675 }; 676 677 // All tfields are sorted by alphabetical order of their keys. 678 if (fields.length() > 1) { 679 std::stable_sort(fields.begin(), fields.end(), tfieldLess); 680 } 681 682 Vector<char, 32> sb; 683 if (!sb.append('t')) { 684 return Err(CanonicalizationError::OutOfMemory); 685 } 686 687 // Append the language subtag if present. 688 // 689 // Replace aliases in tlang per 690 // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>. 691 if (tag.Language().Present()) { 692 if (!sb.append('-')) { 693 return Err(CanonicalizationError::OutOfMemory); 694 } 695 696 MOZ_TRY(tag.CanonicalizeBaseName()); 697 698 // The canonical case for Transform extensions is lowercase per 699 // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two 700 // subtags which don't use lowercase for their canonical syntax. 701 tag.mScript.ToLowerCase(); 702 tag.mRegion.ToLowerCase(); 703 704 if (!LocaleToString(tag, sb)) { 705 return Err(CanonicalizationError::OutOfMemory); 706 } 707 } 708 709 static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1; 710 711 using StringSpan = Span<const char>; 712 713 // Append all fields. 714 // 715 // UTS 35, 3.2.1 specifies: 716 // - Any type or tfield value "true" is removed. 717 // 718 // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore 719 // this apparently invalid part of the UTS 35 specification and simply 720 // append all `tfield` subtags. 721 for (const auto& field : fields) { 722 if (!sb.append('-')) { 723 return Err(CanonicalizationError::OutOfMemory); 724 } 725 726 StringSpan span = extension.Subspan(field.Begin(), field.Length()); 727 StringSpan key = span.To(TransformKeyLength); 728 StringSpan value = span.From(TransformKeyWithSepLength); 729 730 // Search if there's a replacement for the current transform keyword. 731 if (const char* replacement = ReplaceTransformExtensionType(key, value)) { 732 if (!AppendSpan(sb, span.To(TransformKeyWithSepLength))) { 733 return Err(CanonicalizationError::OutOfMemory); 734 } 735 if (!AppendSpan(sb, MakeStringSpan(replacement))) { 736 return Err(CanonicalizationError::OutOfMemory); 737 } 738 } else { 739 if (!AppendSpan(sb, span)) { 740 return Err(CanonicalizationError::OutOfMemory); 741 } 742 } 743 } 744 745 // We can keep the previous extension when canonicalization didn't modify it. 746 if (static_cast<Span<const char>>(sb) != extension) { 747 // Otherwise replace the previous extension with the canonical extension. 748 UniqueChars canonical = DuplicateStringToUniqueChars(sb); 749 if (!canonical) { 750 return Err(CanonicalizationError::OutOfMemory); 751 } 752 aTransformExtension = std::move(canonical); 753 } 754 755 return Ok(); 756 } 757 758 // Zero-terminated ICU Locale ID. 759 using LocaleId = 760 Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>; 761 762 enum class LikelySubtags : bool { Add, Remove }; 763 764 // Return true iff the locale is already maximized resp. minimized. 765 static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) { 766 // The locale is already maximized if the language, script, and region 767 // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are 768 // used. 769 if (aLikelySubtags == LikelySubtags::Add) { 770 return !aTag.Language().EqualTo("und") && 771 (aTag.Script().Present() && !aTag.Script().EqualTo("Zzzz")) && 772 (aTag.Region().Present() && !aTag.Region().EqualTo("ZZ")); 773 } 774 775 // The locale is already minimized if it only contains a language 776 // subtag whose value is not the placeholder value "und". 777 return !aTag.Language().EqualTo("und") && aTag.Script().Missing() && 778 aTag.Region().Missing(); 779 } 780 781 // Create an ICU locale ID from the given locale. 782 static bool CreateLocaleForLikelySubtags(const Locale& aTag, 783 LocaleId& aLocale) { 784 MOZ_ASSERT(aLocale.length() == 0); 785 786 auto appendSubtag = [&aLocale](const auto& subtag) { 787 auto span = subtag.Span(); 788 MOZ_ASSERT(!span.empty()); 789 return aLocale.append(span.data(), span.size()); 790 }; 791 792 // Append the language subtag. 793 if (!appendSubtag(aTag.Language())) { 794 return false; 795 } 796 797 // Append the script subtag if present. 798 if (aTag.Script().Present()) { 799 if (!aLocale.append('_') || !appendSubtag(aTag.Script())) { 800 return false; 801 } 802 } 803 804 // Append the region subtag if present. 805 if (aTag.Region().Present()) { 806 if (!aLocale.append('_') || !appendSubtag(aTag.Region())) { 807 return false; 808 } 809 } 810 811 // Zero-terminated for use with ICU. 812 return aLocale.append('\0'); 813 } 814 815 static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) { 816 using ParserError = LocaleParser::ParserError; 817 818 switch (aErr) { 819 case ParserError::NotParseable: 820 return ICUError::InternalError; 821 case ParserError::OutOfMemory: 822 return ICUError::OutOfMemory; 823 } 824 MOZ_CRASH("Unexpected parser error"); 825 } 826 827 static ICUError CanonicalizationErrorToICUError( 828 Locale::CanonicalizationError aErr) { 829 using CanonicalizationError = Locale::CanonicalizationError; 830 831 switch (aErr) { 832 case CanonicalizationError::DuplicateVariant: 833 case CanonicalizationError::InternalError: 834 return ICUError::InternalError; 835 case CanonicalizationError::OutOfMemory: 836 return ICUError::OutOfMemory; 837 } 838 MOZ_CRASH("Unexpected canonicalization error"); 839 } 840 841 // Assign the language, script, and region subtags from an ICU locale ID. 842 // 843 // ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to 844 // retrieve these subtags, but unfortunately these functions are rather slow, so 845 // we use our own implementation. 846 static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) { 847 // Replace the ICU locale ID separator. 848 std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-'); 849 850 // ICU replaces "und" with the empty string, which means "und" becomes "" and 851 // "und-Latn" becomes "-Latn". Handle this case separately. 852 if (aLocaleId.empty() || aLocaleId[0] == '-') { 853 static constexpr auto und = MakeStringSpan("und"); 854 constexpr size_t length = und.size(); 855 856 // Insert "und" in front of the locale ID. 857 if (!aLocaleId.growBy(length)) { 858 return Err(ICUError::OutOfMemory); 859 } 860 memmove(aLocaleId.begin() + length, aLocaleId.begin(), aLocaleId.length()); 861 memmove(aLocaleId.begin(), und.data(), length); 862 } 863 864 // Retrieve the language, script, and region subtags from the locale ID 865 Locale localeTag; 866 MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag) 867 .mapErr(ParserErrorToICUError)); 868 869 aTag.SetLanguage(localeTag.Language()); 870 aTag.SetScript(localeTag.Script()); 871 aTag.SetRegion(localeTag.Region()); 872 873 return Ok(); 874 } 875 876 template <decltype(uloc_addLikelySubtags) likelySubtagsFn> 877 static ICUResult CallLikelySubtags(const LocaleId& aLocaleId, 878 LocaleId& aResult) { 879 // Locale ID must be zero-terminated before passing it to ICU. 880 MOZ_ASSERT(aLocaleId.back() == '\0'); 881 MOZ_ASSERT(aResult.length() == 0); 882 883 // Ensure there's enough room for the result. 884 MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength)); 885 886 return FillBufferWithICUCall( 887 aResult, [&aLocaleId](char* chars, int32_t size, UErrorCode* status) { 888 return likelySubtagsFn(aLocaleId.begin(), chars, size, status); 889 }); 890 } 891 892 // The canonical way to compute the Unicode BCP 47 locale identifier with likely 893 // subtags is as follows: 894 // 895 // 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU 896 // locale ID. 897 // 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID. 898 // 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into 899 // a Unicode BCP 47 locale identifier. 900 // 901 // Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow 902 // and we know, by construction, that the input Unicode BCP 47 locale identifier 903 // only contains valid language, script, and region subtags, we can avoid both 904 // calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and 905 // AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of 906 // |Intl.Locale.prototype.maximize|.) 907 static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) { 908 // Return early if the input is already maximized/minimized. 909 if (HasLikelySubtags(aLikelySubtags, aTag)) { 910 return Ok(); 911 } 912 913 // Create the locale ID for the input argument. 914 LocaleId locale; 915 if (!CreateLocaleForLikelySubtags(aTag, locale)) { 916 return Err(ICUError::OutOfMemory); 917 } 918 919 // Either add or remove likely subtags to/from the locale ID. 920 LocaleId localeLikelySubtags; 921 if (aLikelySubtags == LikelySubtags::Add) { 922 MOZ_TRY( 923 CallLikelySubtags<uloc_addLikelySubtags>(locale, localeLikelySubtags)); 924 } else { 925 MOZ_TRY( 926 CallLikelySubtags<uloc_minimizeSubtags>(locale, localeLikelySubtags)); 927 } 928 929 // Assign the language, script, and region subtags from the locale ID. 930 MOZ_TRY(AssignFromLocaleId(localeLikelySubtags, aTag)); 931 932 // Update mappings in case ICU returned a non-canonical locale. 933 MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError)); 934 935 return Ok(); 936 } 937 938 ICUResult Locale::AddLikelySubtags() { 939 return LikelySubtags(LikelySubtags::Add, *this); 940 } 941 942 ICUResult Locale::RemoveLikelySubtags() { 943 return LikelySubtags(LikelySubtags::Remove, *this); 944 } 945 946 UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) { 947 size_t length = strlen(aStr) + 1; 948 auto duplicate = MakeUnique<char[]>(length); 949 memcpy(duplicate.get(), aStr, length); 950 return duplicate; 951 } 952 953 UniqueChars Locale::DuplicateStringToUniqueChars(Span<const char> aStr) { 954 size_t length = aStr.size(); 955 auto duplicate = MakeUnique<char[]>(length + 1); 956 memcpy(duplicate.get(), aStr.data(), length); 957 duplicate[length] = '\0'; 958 return duplicate; 959 } 960 961 size_t Locale::ToStringCapacity() const { 962 // This is a bit awkward, the buffer class currently does not support 963 // being resized, so we need to calculate the required size up front and 964 // reserve it all at once. 965 auto lengthSubtag = [](const auto& subtag) { 966 auto span = subtag.Span(); 967 MOZ_ASSERT(!span.empty()); 968 return span.size(); 969 }; 970 971 auto lengthSubtagZ = [](const char* subtag) { 972 size_t length = strlen(subtag); 973 MOZ_ASSERT(length > 0); 974 return length; 975 }; 976 977 auto lengthSubtags = [&lengthSubtag](const auto& subtags) { 978 size_t length = 0; 979 for (const auto& subtag : subtags) { 980 length += lengthSubtag(subtag) + 1; 981 } 982 return length; 983 }; 984 985 auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) { 986 size_t length = 0; 987 for (const auto& subtag : subtags) { 988 length += lengthSubtagZ(subtag.get()) + 1; 989 } 990 return length; 991 }; 992 993 // First calculate required capacity 994 size_t capacity = 0; 995 996 capacity += lengthSubtag(mLanguage); 997 998 if (mScript.Present()) { 999 capacity += lengthSubtag(mScript) + 1; 1000 } 1001 1002 if (mRegion.Present()) { 1003 capacity += lengthSubtag(mRegion) + 1; 1004 } 1005 1006 capacity += lengthSubtags(mVariants); 1007 1008 capacity += lengthSubtagsZ(mExtensions); 1009 1010 if (mPrivateUse.get()) { 1011 capacity += lengthSubtagZ(mPrivateUse.get()) + 1; 1012 } 1013 1014 return capacity; 1015 } 1016 1017 size_t Locale::ToStringAppend(char* aBuffer) const { 1018 // Current write position inside buffer. 1019 size_t offset = 0; 1020 1021 auto appendHyphen = [&offset, &aBuffer]() { 1022 aBuffer[offset] = '-'; 1023 offset += 1; 1024 }; 1025 1026 auto appendSubtag = [&offset, &aBuffer](const auto& subtag) { 1027 auto span = subtag.Span(); 1028 memcpy(aBuffer + offset, span.data(), span.size()); 1029 offset += span.size(); 1030 }; 1031 1032 auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) { 1033 size_t length = strlen(subtag); 1034 memcpy(aBuffer + offset, subtag, length); 1035 offset += length; 1036 }; 1037 1038 auto appendSubtags = [&appendHyphen, &appendSubtag](const auto& subtags) { 1039 for (const auto& subtag : subtags) { 1040 appendHyphen(); 1041 appendSubtag(subtag); 1042 } 1043 }; 1044 1045 auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) { 1046 for (const auto& subtag : subtags) { 1047 appendHyphen(); 1048 appendSubtagZ(subtag.get()); 1049 } 1050 }; 1051 1052 // Append the language subtag. 1053 appendSubtag(mLanguage); 1054 1055 // Append the script subtag if present. 1056 if (mScript.Present()) { 1057 appendHyphen(); 1058 appendSubtag(mScript); 1059 } 1060 1061 // Append the region subtag if present. 1062 if (mRegion.Present()) { 1063 appendHyphen(); 1064 appendSubtag(mRegion); 1065 } 1066 1067 // Append the variant subtags if present. 1068 appendSubtags(mVariants); 1069 1070 // Append the extensions subtags if present. 1071 appendSubtagsZ(mExtensions); 1072 1073 // Append the private-use subtag if present. 1074 if (mPrivateUse.get()) { 1075 appendHyphen(); 1076 appendSubtagZ(mPrivateUse.get()); 1077 } 1078 1079 return offset; 1080 } 1081 1082 LocaleParser::Token LocaleParser::NextToken() { 1083 MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read"); 1084 1085 TokenKind kind = TokenKind::None; 1086 size_t tokenLength = 0; 1087 for (size_t i = mIndex; i < mLength; i++) { 1088 // UTS 35, section 3.1. 1089 // alpha = [A-Z a-z] ; 1090 // digit = [0-9] ; 1091 char c = CharAt(i); 1092 if (IsAsciiAlpha(c)) { 1093 kind |= TokenKind::Alpha; 1094 } else if (IsAsciiDigit(c)) { 1095 kind |= TokenKind::Digit; 1096 } else if (c == '-' && i > mIndex && i + 1 < mLength) { 1097 break; 1098 } else { 1099 return {TokenKind::Error, 0, 0}; 1100 } 1101 tokenLength += 1; 1102 } 1103 1104 Token token{kind, mIndex, tokenLength}; 1105 mIndex += tokenLength + 1; 1106 return token; 1107 } 1108 1109 UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const { 1110 // Add +1 to null-terminate the string. 1111 auto chars = MakeUnique<char[]>(aLength + 1); 1112 char* dest = chars.get(); 1113 std::copy_n(mLocale + aIndex, aLength, dest); 1114 dest[aLength] = '\0'; 1115 return chars; 1116 } 1117 1118 // Parse the `unicode_language_id` production. 1119 // 1120 // unicode_language_id = unicode_language_subtag 1121 // (sep unicode_script_subtag)? 1122 // (sep unicode_region_subtag)? 1123 // (sep unicode_variant_subtag)* ; 1124 // 1125 // sep = "-" 1126 // 1127 // Note: Unicode CLDR locale identifier backward compatibility extensions 1128 // removed from `unicode_language_id`. 1129 // 1130 // |tok| is the current token from |ts|. 1131 // 1132 // All subtags will be added unaltered to |tag|, without canonicalizing their 1133 // case or, in the case of variant subtags, detecting and rejecting duplicate 1134 // variants. Users must subsequently |CanonicalizeBaseName| to perform these 1135 // actions. 1136 // 1137 // Do not use this function directly: use |ParseBaseName| or 1138 // |ParseTlangFromTransformExtension| instead. 1139 Result<Ok, LocaleParser::ParserError> LocaleParser::InternalParseBaseName( 1140 LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { 1141 if (aLocaleParser.IsLanguage(aTok)) { 1142 aLocaleParser.CopyChars(aTok, aTag.mLanguage); 1143 1144 aTok = aLocaleParser.NextToken(); 1145 } else { 1146 // The language subtag is mandatory. 1147 return Err(ParserError::NotParseable); 1148 } 1149 1150 if (aLocaleParser.IsScript(aTok)) { 1151 aLocaleParser.CopyChars(aTok, aTag.mScript); 1152 1153 aTok = aLocaleParser.NextToken(); 1154 } 1155 1156 if (aLocaleParser.IsRegion(aTok)) { 1157 aLocaleParser.CopyChars(aTok, aTag.mRegion); 1158 1159 aTok = aLocaleParser.NextToken(); 1160 } 1161 1162 auto& variants = aTag.mVariants; 1163 MOZ_ASSERT(variants.length() == 0); 1164 while (aLocaleParser.IsVariant(aTok)) { 1165 VariantSubtag variant{}; 1166 aLocaleParser.CopyChars(aTok, variant); 1167 if (!variants.append(variant)) { 1168 return Err(ParserError::OutOfMemory); 1169 } 1170 1171 aTok = aLocaleParser.NextToken(); 1172 } 1173 1174 return Ok(); 1175 } 1176 1177 Result<Ok, LocaleParser::ParserError> LocaleParser::TryParse( 1178 mozilla::Span<const char> aLocale, Locale& aTag) { 1179 // |aTag| must be a new, empty Locale. 1180 MOZ_ASSERT(aTag.Language().Missing()); 1181 MOZ_ASSERT(aTag.Script().Missing()); 1182 MOZ_ASSERT(aTag.Region().Missing()); 1183 MOZ_ASSERT(aTag.Variants().empty()); 1184 MOZ_ASSERT(aTag.Extensions().empty()); 1185 MOZ_ASSERT(aTag.PrivateUse().isNothing()); 1186 1187 // unicode_locale_id = unicode_language_id 1188 // extensions* 1189 // pu_extensions? ; 1190 1191 LocaleParser ts(aLocale); 1192 Token tok = ts.NextToken(); 1193 1194 MOZ_TRY(ParseBaseName(ts, aTag, tok)); 1195 1196 // extensions = unicode_locale_extensions 1197 // | transformed_extensions 1198 // | other_extensions ; 1199 1200 // Bit set of seen singletons. 1201 uint64_t seenSingletons = 0; 1202 1203 auto& extensions = aTag.mExtensions; 1204 while (ts.IsExtensionStart(tok)) { 1205 char singleton = ts.SingletonKey(tok); 1206 1207 // Reject the input if a duplicate singleton was found. 1208 uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1); 1209 if (seenSingletons & hash) { 1210 return Err(ParserError::NotParseable); 1211 } 1212 seenSingletons |= hash; 1213 1214 Token start = tok; 1215 tok = ts.NextToken(); 1216 1217 // We'll check for missing non-singleton subtags after this block by 1218 // comparing |startValue| with the then-current position. 1219 size_t startValue = tok.Index(); 1220 1221 if (singleton == 'u') { 1222 while (ts.IsUnicodeExtensionPart(tok)) { 1223 tok = ts.NextToken(); 1224 } 1225 } else if (singleton == 't') { 1226 // transformed_extensions = sep [tT] 1227 // ((sep tlang (sep tfield)*) 1228 // | (sep tfield)+) ; 1229 1230 // tlang = unicode_language_subtag 1231 // (sep unicode_script_subtag)? 1232 // (sep unicode_region_subtag)? 1233 // (sep unicode_variant_subtag)* ; 1234 if (ts.IsLanguage(tok)) { 1235 tok = ts.NextToken(); 1236 1237 if (ts.IsScript(tok)) { 1238 tok = ts.NextToken(); 1239 } 1240 1241 if (ts.IsRegion(tok)) { 1242 tok = ts.NextToken(); 1243 } 1244 1245 while (ts.IsVariant(tok)) { 1246 tok = ts.NextToken(); 1247 } 1248 } 1249 1250 // tfield = tkey tvalue; 1251 while (ts.IsTransformExtensionKey(tok)) { 1252 tok = ts.NextToken(); 1253 1254 size_t startTValue = tok.Index(); 1255 while (ts.IsTransformExtensionPart(tok)) { 1256 tok = ts.NextToken(); 1257 } 1258 1259 // `tfield` requires at least one `tvalue`. 1260 if (tok.Index() <= startTValue) { 1261 return Err(ParserError::NotParseable); 1262 } 1263 } 1264 } else { 1265 while (ts.IsOtherExtensionPart(tok)) { 1266 tok = ts.NextToken(); 1267 } 1268 } 1269 1270 // Singletons must be followed by a non-singleton subtag, "en-a-b" is not 1271 // allowed. 1272 if (tok.Index() <= startValue) { 1273 return Err(ParserError::NotParseable); 1274 } 1275 1276 UniqueChars extension = ts.Extension(start, tok); 1277 if (!extensions.append(std::move(extension))) { 1278 return Err(ParserError::OutOfMemory); 1279 } 1280 } 1281 1282 // Trailing `pu_extension` component of the `unicode_locale_id` production. 1283 if (ts.IsPrivateUseStart(tok)) { 1284 Token start = tok; 1285 tok = ts.NextToken(); 1286 1287 size_t startValue = tok.Index(); 1288 while (ts.IsPrivateUsePart(tok)) { 1289 tok = ts.NextToken(); 1290 } 1291 1292 // There must be at least one subtag after the "-x-". 1293 if (tok.Index() <= startValue) { 1294 return Err(ParserError::NotParseable); 1295 } 1296 1297 UniqueChars privateUse = ts.Extension(start, tok); 1298 aTag.mPrivateUse = std::move(privateUse); 1299 } 1300 1301 if (!tok.IsNone()) { 1302 return Err(ParserError::NotParseable); 1303 } 1304 1305 return Ok(); 1306 } 1307 1308 Result<Ok, LocaleParser::ParserError> LocaleParser::TryParseBaseName( 1309 Span<const char> aLocale, Locale& aTag) { 1310 // |aTag| must be a new, empty Locale. 1311 MOZ_ASSERT(aTag.Language().Missing()); 1312 MOZ_ASSERT(aTag.Script().Missing()); 1313 MOZ_ASSERT(aTag.Region().Missing()); 1314 MOZ_ASSERT(aTag.Variants().empty()); 1315 MOZ_ASSERT(aTag.Extensions().empty()); 1316 MOZ_ASSERT(aTag.PrivateUse().isNothing()); 1317 1318 LocaleParser ts(aLocale); 1319 Token tok = ts.NextToken(); 1320 1321 MOZ_TRY(ParseBaseName(ts, aTag, tok)); 1322 if (!tok.IsNone()) { 1323 return Err(ParserError::NotParseable); 1324 } 1325 1326 return Ok(); 1327 } 1328 1329 // Parse |aExtension|, which must be a valid `transformed_extensions` subtag, 1330 // and fill |aTag| and |aFields| from the `tlang` and `tfield` components. 1331 Result<Ok, LocaleParser::ParserError> LocaleParser::ParseTransformExtension( 1332 Span<const char> aExtension, Locale& aTag, TFieldVector& aFields) { 1333 LocaleParser ts(aExtension); 1334 Token tok = ts.NextToken(); 1335 1336 if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 't') { 1337 return Err(ParserError::NotParseable); 1338 } 1339 1340 tok = ts.NextToken(); 1341 1342 if (tok.IsNone()) { 1343 return Err(ParserError::NotParseable); 1344 } 1345 1346 if (ts.IsLanguage(tok)) { 1347 // We're parsing a possible `tlang` in a known-valid transform extension, so 1348 // use the special-purpose function that takes advantage of this to compute 1349 // lowercased |tag| contents in an optimal manner. 1350 MOZ_TRY(ParseTlangInTransformExtension(ts, aTag, tok)); 1351 1352 // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end 1353 // of the transform extension. 1354 MOZ_ASSERT(ts.IsTransformExtensionKey(tok) || tok.IsNone()); 1355 } else { 1356 // If there's no `tlang` subtag, at least one `tfield` must be present. 1357 MOZ_ASSERT(ts.IsTransformExtensionKey(tok)); 1358 } 1359 1360 // Trailing `tfield` subtags. (Any other trailing subtags are an error, 1361 // because we're guaranteed to only see a valid tranform extension here.) 1362 while (ts.IsTransformExtensionKey(tok)) { 1363 size_t begin = tok.Index(); 1364 tok = ts.NextToken(); 1365 1366 size_t startTValue = tok.Index(); 1367 while (ts.IsTransformExtensionPart(tok)) { 1368 tok = ts.NextToken(); 1369 } 1370 1371 // `tfield` requires at least one `tvalue`. 1372 if (tok.Index() <= startTValue) { 1373 return Err(ParserError::NotParseable); 1374 } 1375 1376 size_t length = tok.Index() - 1 - begin; 1377 if (!aFields.emplaceBack(begin, length)) { 1378 return Err(ParserError::OutOfMemory); 1379 } 1380 } 1381 1382 if (!tok.IsNone()) { 1383 return Err(ParserError::NotParseable); 1384 } 1385 1386 return Ok(); 1387 } 1388 1389 // Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag, 1390 // and fill |aAttributes| and |aKeywords| from the `attribute` and `keyword` 1391 // components. 1392 Result<Ok, LocaleParser::ParserError> LocaleParser::ParseUnicodeExtension( 1393 Span<const char> aExtension, AttributesVector& aAttributes, 1394 KeywordsVector& aKeywords) { 1395 LocaleParser ts(aExtension); 1396 Token tok = ts.NextToken(); 1397 1398 // unicode_locale_extensions = sep [uU] ((sep keyword)+ | 1399 // (sep attribute)+ (sep keyword)*) ; 1400 1401 if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { 1402 return Err(ParserError::NotParseable); 1403 } 1404 1405 tok = ts.NextToken(); 1406 1407 if (tok.IsNone()) { 1408 return Err(ParserError::NotParseable); 1409 } 1410 1411 while (ts.IsUnicodeExtensionAttribute(tok)) { 1412 if (!aAttributes.emplaceBack(tok.Index(), tok.Length())) { 1413 return Err(ParserError::OutOfMemory); 1414 } 1415 1416 tok = ts.NextToken(); 1417 } 1418 1419 // keyword = key (sep type)? ; 1420 while (ts.IsUnicodeExtensionKey(tok)) { 1421 size_t begin = tok.Index(); 1422 tok = ts.NextToken(); 1423 1424 while (ts.IsUnicodeExtensionType(tok)) { 1425 tok = ts.NextToken(); 1426 } 1427 1428 if (tok.IsError()) { 1429 return Err(ParserError::NotParseable); 1430 } 1431 1432 size_t length = tok.Index() - 1 - begin; 1433 if (!aKeywords.emplaceBack(begin, length)) { 1434 return Err(ParserError::OutOfMemory); 1435 } 1436 } 1437 1438 if (!tok.IsNone()) { 1439 return Err(ParserError::NotParseable); 1440 } 1441 1442 return Ok(); 1443 } 1444 1445 Result<Ok, LocaleParser::ParserError> LocaleParser::CanParseUnicodeExtension( 1446 Span<const char> aExtension) { 1447 LocaleParser ts(aExtension); 1448 Token tok = ts.NextToken(); 1449 1450 // unicode_locale_extensions = sep [uU] ((sep keyword)+ | 1451 // (sep attribute)+ (sep keyword)*) ; 1452 1453 if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') { 1454 return Err(ParserError::NotParseable); 1455 } 1456 1457 tok = ts.NextToken(); 1458 1459 if (tok.IsNone()) { 1460 return Err(ParserError::NotParseable); 1461 } 1462 1463 while (ts.IsUnicodeExtensionAttribute(tok)) { 1464 tok = ts.NextToken(); 1465 } 1466 1467 // keyword = key (sep type)? ; 1468 while (ts.IsUnicodeExtensionKey(tok)) { 1469 tok = ts.NextToken(); 1470 1471 while (ts.IsUnicodeExtensionType(tok)) { 1472 tok = ts.NextToken(); 1473 } 1474 1475 if (tok.IsError()) { 1476 return Err(ParserError::NotParseable); 1477 } 1478 } 1479 1480 if (!tok.IsNone()) { 1481 return Err(ParserError::OutOfMemory); 1482 } 1483 1484 return Ok(); 1485 } 1486 1487 Result<Ok, LocaleParser::ParserError> 1488 LocaleParser::CanParseUnicodeExtensionType(Span<const char> aUnicodeType) { 1489 MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings"); 1490 1491 LocaleParser ts(aUnicodeType); 1492 Token tok = ts.NextToken(); 1493 1494 while (ts.IsUnicodeExtensionType(tok)) { 1495 tok = ts.NextToken(); 1496 } 1497 1498 if (!tok.IsNone()) { 1499 return Err(ParserError::NotParseable); 1500 } 1501 1502 return Ok(); 1503 } 1504 1505 } // namespace mozilla::intl