Locale.h (25513B)
1 /* This Source Code Form is subject to the terms of the Mozilla Public 2 * License, v. 2.0. If a copy of the MPL was not distributed with this 3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 4 5 /* 6 * Structured representation of Unicode locale IDs used with Intl functions. 7 * 8 * Spec: 9 * https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers 10 */ 11 12 #ifndef intl_components_Locale_h 13 #define intl_components_Locale_h 14 15 #include "mozilla/Assertions.h" 16 #include "mozilla/intl/ICUError.h" 17 #include "mozilla/intl/ICU4CGlue.h" 18 #include "mozilla/Maybe.h" 19 #include "mozilla/Span.h" 20 #include "mozilla/TextUtils.h" 21 #include "mozilla/Try.h" 22 #include "mozilla/TypedEnumBits.h" 23 #include "mozilla/Vector.h" 24 25 #include <algorithm> 26 #include <stddef.h> 27 #include <stdint.h> 28 #include <string.h> 29 #include <utility> 30 31 #include "unicode/uloc.h" 32 33 namespace mozilla::intl { 34 35 /** 36 * Return true if |language| is a valid language subtag. 37 */ 38 template <typename CharT> 39 bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> aLanguage); 40 41 /** 42 * Return true if |script| is a valid script subtag. 43 */ 44 template <typename CharT> 45 bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> aScript); 46 47 /** 48 * Return true if |region| is a valid region subtag. 49 */ 50 template <typename CharT> 51 bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> aRegion); 52 53 /** 54 * Return true if |variant| is a valid variant subtag. 55 */ 56 template <typename CharT> 57 bool IsStructurallyValidVariantTag(mozilla::Span<const CharT> aVariant); 58 59 #ifdef DEBUG 60 /** 61 * Return true if |extension| is a valid Unicode extension subtag. 62 */ 63 bool IsStructurallyValidUnicodeExtensionTag( 64 mozilla::Span<const char> aExtension); 65 66 /** 67 * Return true if |privateUse| is a valid private-use subtag. 68 */ 69 bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> aPrivateUse); 70 71 #endif 72 73 template <typename CharT> 74 char AsciiToLowerCase(CharT aChar) { 75 MOZ_ASSERT(mozilla::IsAscii(aChar)); 76 return mozilla::IsAsciiUppercaseAlpha(aChar) ? (aChar + 0x20) : aChar; 77 } 78 79 template <typename CharT> 80 char AsciiToUpperCase(CharT aChar) { 81 MOZ_ASSERT(mozilla::IsAscii(aChar)); 82 return mozilla::IsAsciiLowercaseAlpha(aChar) ? (aChar - 0x20) : aChar; 83 } 84 85 template <typename CharT> 86 void AsciiToLowerCase(CharT* aChars, size_t aLength, char* aDest) { 87 char (&fn)(CharT) = AsciiToLowerCase; 88 std::transform(aChars, aChars + aLength, aDest, fn); 89 } 90 91 template <typename CharT> 92 void AsciiToUpperCase(CharT* aChars, size_t aLength, char* aDest) { 93 char (&fn)(CharT) = AsciiToUpperCase; 94 std::transform(aChars, aChars + aLength, aDest, fn); 95 } 96 97 template <typename CharT> 98 void AsciiToTitleCase(CharT* aChars, size_t aLength, char* aDest) { 99 if (aLength > 0) { 100 AsciiToUpperCase(aChars, 1, aDest); 101 AsciiToLowerCase(aChars + 1, aLength - 1, aDest + 1); 102 } 103 } 104 105 // Constants for language subtag lengths. 106 namespace LanguageTagLimits { 107 108 // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; 109 static constexpr size_t LanguageLength = 8; 110 111 // unicode_script_subtag = alpha{4} ; 112 static constexpr size_t ScriptLength = 4; 113 114 // unicode_region_subtag = (alpha{2} | digit{3}) ; 115 static constexpr size_t RegionLength = 3; 116 static constexpr size_t AlphaRegionLength = 2; 117 static constexpr size_t DigitRegionLength = 3; 118 119 // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; 120 static constexpr size_t VariantLength = 8; 121 122 // key = alphanum alpha ; 123 static constexpr size_t UnicodeKeyLength = 2; 124 125 // tkey = alpha digit ; 126 static constexpr size_t TransformKeyLength = 2; 127 128 } // namespace LanguageTagLimits 129 130 // Fixed size language subtag which is stored inline in Locale. 131 template <size_t SubtagLength> 132 class LanguageTagSubtag final { 133 uint8_t mLength = 0; 134 char mChars[SubtagLength] = {}; // zero initialize 135 136 public: 137 LanguageTagSubtag() = default; 138 139 LanguageTagSubtag(const LanguageTagSubtag& aOther) { 140 std::copy_n(aOther.mChars, SubtagLength, mChars); 141 mLength = aOther.mLength; 142 } 143 144 template <typename CharT> 145 explicit LanguageTagSubtag(mozilla::Span<const CharT> str) { 146 Set(str); 147 } 148 149 LanguageTagSubtag& operator=(const LanguageTagSubtag& aOther) { 150 std::copy_n(aOther.mChars, SubtagLength, mChars); 151 mLength = aOther.mLength; 152 return *this; 153 } 154 155 size_t Length() const { return mLength; } 156 bool Missing() const { return mLength == 0; } 157 bool Present() const { return mLength > 0; } 158 159 mozilla::Span<const char> Span() const { return {mChars, mLength}; } 160 161 template <typename CharT> 162 void Set(mozilla::Span<const CharT> str) { 163 MOZ_ASSERT(str.size() <= SubtagLength); 164 std::copy_n(str.data(), str.size(), mChars); 165 mLength = str.size(); 166 } 167 168 // The toXYZCase() methods are using |SubtagLength| instead of |length()|, 169 // because current compilers (tested GCC and Clang) can't infer the maximum 170 // string length - even when using hints like |std::min| - and instead are 171 // emitting SIMD optimized code. Using a fixed sized length avoids emitting 172 // the SIMD code. (Emitting SIMD code doesn't make sense here, because the 173 // SIMD code only kicks in for long strings.) A fixed length will 174 // additionally ensure the compiler unrolls the loop in the case conversion 175 // code. 176 177 void ToLowerCase() { AsciiToLowerCase(mChars, SubtagLength, mChars); } 178 179 void ToUpperCase() { AsciiToUpperCase(mChars, SubtagLength, mChars); } 180 181 void ToTitleCase() { AsciiToTitleCase(mChars, SubtagLength, mChars); } 182 183 template <size_t N> 184 bool EqualTo(const char (&str)[N]) const { 185 static_assert(N - 1 <= SubtagLength, 186 "subtag literals must not exceed the maximum subtag length"); 187 188 return mLength == N - 1 && memcmp(mChars, str, N - 1) == 0; 189 } 190 }; 191 192 using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>; 193 using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>; 194 using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>; 195 using VariantSubtag = LanguageTagSubtag<LanguageTagLimits::VariantLength>; 196 197 using Latin1Char = unsigned char; 198 using UniqueChars = UniquePtr<char[]>; 199 200 /** 201 * Object representing a Unicode BCP 47 locale identifier. 202 * 203 * All subtags are already in canonicalized case. 204 */ 205 class MOZ_STACK_CLASS Locale final { 206 public: 207 using VariantsVector = Vector<VariantSubtag, 2>; 208 using ExtensionsVector = Vector<UniqueChars, 2>; 209 210 private: 211 LanguageSubtag mLanguage = {}; 212 ScriptSubtag mScript = {}; 213 RegionSubtag mRegion = {}; 214 215 VariantsVector mVariants; 216 ExtensionsVector mExtensions; 217 UniqueChars mPrivateUse = nullptr; 218 219 friend class LocaleParser; 220 221 public: 222 enum class CanonicalizationError : uint8_t { 223 DuplicateVariant, 224 InternalError, 225 OutOfMemory, 226 }; 227 228 private: 229 Result<Ok, CanonicalizationError> CanonicalizeUnicodeExtension( 230 UniqueChars& unicodeExtension); 231 232 Result<Ok, CanonicalizationError> CanonicalizeTransformExtension( 233 UniqueChars& transformExtension); 234 235 public: 236 static bool LanguageMapping(LanguageSubtag& aLanguage); 237 static bool ComplexLanguageMapping(const LanguageSubtag& aLanguage); 238 239 private: 240 static bool ScriptMapping(ScriptSubtag& aScript); 241 static bool RegionMapping(RegionSubtag& aRegion); 242 static bool ComplexRegionMapping(const RegionSubtag& aRegion); 243 244 void PerformComplexLanguageMappings(); 245 void PerformComplexRegionMappings(); 246 [[nodiscard]] bool PerformVariantMappings(); 247 248 [[nodiscard]] bool UpdateLegacyMappings(); 249 250 static bool SignLanguageMapping(LanguageSubtag& aLanguage, 251 const RegionSubtag& aRegion); 252 253 static const char* ReplaceTransformExtensionType( 254 mozilla::Span<const char> aKey, mozilla::Span<const char> aType); 255 256 static mozilla::Span<const char> ToSpan(const UniqueChars& aChars) { 257 return MakeStringSpan(aChars.get()); 258 } 259 260 template <size_t N> 261 static mozilla::Span<const char> ToSpan(const LanguageTagSubtag<N>& aSubtag) { 262 return aSubtag.Span(); 263 } 264 265 public: 266 /** 267 * Given a Unicode key and type, return the null-terminated preferred 268 * replacement for that type if there is one, or null if there is none, e.g. 269 * in effect 270 * |ReplaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"| 271 * and 272 * |ReplaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|. 273 */ 274 static const char* ReplaceUnicodeExtensionType( 275 mozilla::Span<const char> aKey, mozilla::Span<const char> aType); 276 277 public: 278 Locale() = default; 279 Locale(const Locale&) = delete; 280 Locale& operator=(const Locale&) = delete; 281 Locale(Locale&&) = default; 282 Locale& operator=(Locale&&) = default; 283 284 template <class Vec> 285 class SubtagIterator { 286 using Iter = decltype(std::declval<const Vec>().begin()); 287 288 Iter mIter; 289 290 public: 291 explicit SubtagIterator(Iter iter) : mIter(iter) {} 292 293 // std::iterator traits. 294 using iterator_category = std::input_iterator_tag; 295 using value_type = Span<const char>; 296 using difference_type = ptrdiff_t; 297 using pointer = value_type*; 298 using reference = value_type&; 299 300 SubtagIterator& operator++() { 301 mIter++; 302 return *this; 303 } 304 305 SubtagIterator operator++(int) { 306 SubtagIterator result = *this; 307 ++(*this); 308 return result; 309 } 310 311 bool operator==(const SubtagIterator& aOther) const { 312 return mIter == aOther.mIter; 313 } 314 315 bool operator!=(const SubtagIterator& aOther) const { 316 return !(*this == aOther); 317 } 318 319 value_type operator*() const { return ToSpan(*mIter); } 320 }; 321 322 template <typename T, size_t N> 323 class SubtagEnumeration { 324 using Vec = Vector<T, N>; 325 326 const Vec& mVector; 327 328 public: 329 explicit SubtagEnumeration(const Vec& aVector) : mVector(aVector) {} 330 331 size_t length() const { return mVector.length(); } 332 bool empty() const { return mVector.empty(); } 333 334 auto begin() const { return SubtagIterator<Vec>(mVector.begin()); } 335 auto end() const { return SubtagIterator<Vec>(mVector.end()); } 336 337 Span<const char> operator[](size_t aIndex) const { 338 return ToSpan(mVector[aIndex]); 339 } 340 }; 341 342 const LanguageSubtag& Language() const { return mLanguage; } 343 const ScriptSubtag& Script() const { return mScript; } 344 const RegionSubtag& Region() const { return mRegion; } 345 auto Variants() const { return SubtagEnumeration(mVariants); } 346 auto Extensions() const { return SubtagEnumeration(mExtensions); } 347 Maybe<Span<const char>> PrivateUse() const { 348 if (const char* p = mPrivateUse.get()) { 349 return Some(MakeStringSpan(p)); 350 } 351 return Nothing(); 352 } 353 354 /** 355 * Return the Unicode extension subtag or Nothing if not present. 356 */ 357 Maybe<Span<const char>> GetUnicodeExtension() const; 358 359 private: 360 ptrdiff_t UnicodeExtensionIndex() const; 361 362 public: 363 /** 364 * Set the language subtag. The input must be a valid language subtag. 365 */ 366 template <size_t N> 367 void SetLanguage(const char (&aLanguage)[N]) { 368 mozilla::Span<const char> span(aLanguage, N - 1); 369 MOZ_ASSERT(IsStructurallyValidLanguageTag(span)); 370 mLanguage.Set(span); 371 } 372 373 /** 374 * Set the language subtag. The input must be a valid language subtag. 375 */ 376 void SetLanguage(const LanguageSubtag& aLanguage) { 377 MOZ_ASSERT(IsStructurallyValidLanguageTag(aLanguage.Span())); 378 mLanguage.Set(aLanguage.Span()); 379 } 380 381 /** 382 * Set the script subtag. The input must be a valid script subtag. 383 */ 384 template <size_t N> 385 void SetScript(const char (&aScript)[N]) { 386 mozilla::Span<const char> span(aScript, N - 1); 387 MOZ_ASSERT(IsStructurallyValidScriptTag(span)); 388 mScript.Set(span); 389 } 390 391 /** 392 * Set the script subtag. The input must be a valid script subtag or the empty 393 * string. 394 */ 395 void SetScript(const ScriptSubtag& aScript) { 396 MOZ_ASSERT(aScript.Missing() || 397 IsStructurallyValidScriptTag(aScript.Span())); 398 mScript.Set(aScript.Span()); 399 } 400 401 /** 402 * Set the region subtag. The input must be a valid region subtag. 403 */ 404 template <size_t N> 405 void SetRegion(const char (&aRegion)[N]) { 406 mozilla::Span<const char> span(aRegion, N - 1); 407 MOZ_ASSERT(IsStructurallyValidRegionTag(span)); 408 mRegion.Set(span); 409 } 410 411 /** 412 * Set the region subtag. The input must be a valid region subtag or the empty 413 * empty string. 414 */ 415 void SetRegion(const RegionSubtag& aRegion) { 416 MOZ_ASSERT(aRegion.Missing() || 417 IsStructurallyValidRegionTag(aRegion.Span())); 418 mRegion.Set(aRegion.Span()); 419 } 420 421 /** 422 * Set the variant subtags. Each element must be a valid variant subtag. 423 */ 424 void SetVariants(VariantsVector&& aVariants) { 425 MOZ_ASSERT(std::all_of( 426 aVariants.begin(), aVariants.end(), [](const auto& variant) { 427 return IsStructurallyValidVariantTag(variant.Span()); 428 })); 429 mVariants = std::move(aVariants); 430 } 431 432 /** 433 * Remove all variant subtags. 434 */ 435 void ClearVariants() { mVariants.clearAndFree(); } 436 437 /** 438 * Set the Unicode extension subtag. The input must be a valid Unicode 439 * extension subtag. 440 */ 441 ICUResult SetUnicodeExtension(Span<const char> aExtension); 442 443 /** 444 * Remove any Unicode extension subtag if present. 445 */ 446 void ClearUnicodeExtension(); 447 448 /** Canonicalize the base-name (language, script, region, variant) subtags. */ 449 Result<Ok, CanonicalizationError> CanonicalizeBaseName(); 450 451 /** 452 * Canonicalize all extension subtags. 453 */ 454 Result<Ok, CanonicalizationError> CanonicalizeExtensions(); 455 456 /** 457 * Canonicalizes the given structurally valid Unicode BCP 47 locale 458 * identifier, including regularized case of subtags. For example, the 459 * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE, 460 * where 461 * 462 * Zh ; 2*3ALPHA 463 * -haNS ; ["-" script] 464 * -bu ; ["-" region] 465 * -variant2 ; *("-" variant) 466 * -Variant1 467 * -u-ca-chinese ; *("-" extension) 468 * -t-Zh-laTN 469 * -x-PRIVATE ; ["-" privateuse] 470 * 471 * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private 472 * 473 * Spec: ECMAScript Internationalization API Specification, 6.2.3. 474 */ 475 Result<Ok, CanonicalizationError> Canonicalize() { 476 MOZ_TRY(CanonicalizeBaseName()); 477 return CanonicalizeExtensions(); 478 } 479 480 /** 481 * Fill the buffer with a string representation of the locale. 482 */ 483 template <typename B> 484 ICUResult ToString(B& aBuffer) const { 485 static_assert(std::is_same_v<typename B::CharType, char>); 486 487 size_t capacity = ToStringCapacity(); 488 489 // Attempt to reserve needed capacity 490 if (!aBuffer.reserve(capacity)) { 491 return Err(ICUError::OutOfMemory); 492 } 493 494 size_t offset = ToStringAppend(aBuffer.data()); 495 496 MOZ_ASSERT(capacity == offset); 497 aBuffer.written(offset); 498 499 return Ok(); 500 } 501 502 /** 503 * Add likely-subtags to the locale. 504 * 505 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags> 506 */ 507 ICUResult AddLikelySubtags(); 508 509 /** 510 * Remove likely-subtags from the locale. 511 * 512 * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags> 513 */ 514 ICUResult RemoveLikelySubtags(); 515 516 /** 517 * Returns the default locale as an ICU locale identifier. The returned string 518 * is NOT a valid BCP 47 locale! 519 * 520 * Also see <https://unicode-org.github.io/icu/userguide/locale>. 521 */ 522 static const char* GetDefaultLocale() { return uloc_getDefault(); } 523 524 /** 525 * Returns an iterator over all supported locales. 526 * 527 * The returned strings are ICU locale identifiers and NOT BCP 47 language 528 * tags. 529 * 530 * Also see <https://unicode-org.github.io/icu/userguide/locale>. 531 */ 532 static auto GetAvailableLocales() { 533 return AvailableLocalesEnumeration<uloc_countAvailable, 534 uloc_getAvailable>(); 535 } 536 537 private: 538 static UniqueChars DuplicateStringToUniqueChars(const char* aStr); 539 static UniqueChars DuplicateStringToUniqueChars(Span<const char> aStr); 540 size_t ToStringCapacity() const; 541 size_t ToStringAppend(char* aBuffer) const; 542 }; 543 544 /** 545 * Parser for Unicode BCP 47 locale identifiers. 546 * 547 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers> 548 */ 549 class MOZ_STACK_CLASS LocaleParser final { 550 public: 551 enum class ParserError : uint8_t { 552 // Input was not parseable as a locale, subtag or extension. 553 NotParseable, 554 // Unable to allocate memory for the parser to operate. 555 OutOfMemory, 556 }; 557 558 // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|. 559 enum class TokenKind : uint8_t { 560 None = 0b000, 561 Alpha = 0b001, 562 Digit = 0b010, 563 AlphaDigit = 0b011, 564 Error = 0b100 565 }; 566 567 private: 568 class Token final { 569 size_t mIndex; 570 size_t mLength; 571 TokenKind mKind; 572 573 public: 574 Token(TokenKind aKind, size_t aIndex, size_t aLength) 575 : mIndex(aIndex), mLength(aLength), mKind(aKind) {} 576 577 TokenKind Kind() const { return mKind; } 578 size_t Index() const { return mIndex; } 579 size_t Length() const { return mLength; } 580 581 bool IsError() const { return mKind == TokenKind::Error; } 582 bool IsNone() const { return mKind == TokenKind::None; } 583 bool IsAlpha() const { return mKind == TokenKind::Alpha; } 584 bool IsDigit() const { return mKind == TokenKind::Digit; } 585 bool IsAlphaDigit() const { return mKind == TokenKind::AlphaDigit; } 586 }; 587 588 const char* mLocale; 589 size_t mLength; 590 size_t mIndex = 0; 591 592 explicit LocaleParser(Span<const char> aLocale) 593 : mLocale(aLocale.data()), mLength(aLocale.size()) {} 594 595 char CharAt(size_t aIndex) const { return mLocale[aIndex]; } 596 597 // Copy the token characters into |subtag|. 598 template <size_t N> 599 void CopyChars(const Token& aTok, LanguageTagSubtag<N>& aSubtag) const { 600 aSubtag.Set(mozilla::Span(mLocale + aTok.Index(), aTok.Length())); 601 } 602 603 // Create a string copy of |length| characters starting at |index|. 604 UniqueChars Chars(size_t aIndex, size_t aLength) const; 605 606 // Create a string copy of the token characters. 607 UniqueChars Chars(const Token& aTok) const { 608 return Chars(aTok.Index(), aTok.Length()); 609 } 610 611 UniqueChars Extension(const Token& aStart, const Token& aEnd) const { 612 MOZ_ASSERT(aStart.Index() < aEnd.Index()); 613 614 size_t length = aEnd.Index() - 1 - aStart.Index(); 615 return Chars(aStart.Index(), length); 616 } 617 618 Token NextToken(); 619 620 // unicode_language_subtag = alpha{2,3} | alpha{5,8} ; 621 // 622 // Four character language subtags are not allowed in Unicode BCP 47 locale 623 // identifiers. Also see the comparison to Unicode CLDR locale identifiers in 624 // <https://unicode.org/reports/tr35/#BCP_47_Conformance>. 625 bool IsLanguage(const Token& aTok) const { 626 return aTok.IsAlpha() && ((2 <= aTok.Length() && aTok.Length() <= 3) || 627 (5 <= aTok.Length() && aTok.Length() <= 8)); 628 } 629 630 // unicode_script_subtag = alpha{4} ; 631 bool IsScript(const Token& aTok) const { 632 return aTok.IsAlpha() && aTok.Length() == 4; 633 } 634 635 // unicode_region_subtag = (alpha{2} | digit{3}) ; 636 bool IsRegion(const Token& aTok) const { 637 return (aTok.IsAlpha() && aTok.Length() == 2) || 638 (aTok.IsDigit() && aTok.Length() == 3); 639 } 640 641 // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ; 642 bool IsVariant(const Token& aTok) const { 643 return (5 <= aTok.Length() && aTok.Length() <= 8) || 644 (aTok.Length() == 4 && mozilla::IsAsciiDigit(CharAt(aTok.Index()))); 645 } 646 647 // Returns the code unit of the first character at the given singleton token. 648 // Always returns the lower case form of an alphabetical character. 649 char SingletonKey(const Token& aTok) const { 650 MOZ_ASSERT(aTok.Length() == 1); 651 return AsciiToLowerCase(CharAt(aTok.Index())); 652 } 653 654 // extensions = unicode_locale_extensions | 655 // transformed_extensions | 656 // other_extensions ; 657 // 658 // unicode_locale_extensions = sep [uU] ((sep keyword)+ | 659 // (sep attribute)+ (sep keyword)*) ; 660 // 661 // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) | 662 // (sep tfield)+) ; 663 // 664 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; 665 bool IsExtensionStart(const Token& aTok) const { 666 return aTok.Length() == 1 && SingletonKey(aTok) != 'x'; 667 } 668 669 // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ; 670 bool IsOtherExtensionPart(const Token& aTok) const { 671 return 2 <= aTok.Length() && aTok.Length() <= 8; 672 } 673 674 // unicode_locale_extensions = sep [uU] ((sep keyword)+ | 675 // (sep attribute)+ (sep keyword)*) ; 676 // keyword = key (sep type)? ; 677 bool IsUnicodeExtensionPart(const Token& aTok) const { 678 return IsUnicodeExtensionKey(aTok) || IsUnicodeExtensionType(aTok) || 679 IsUnicodeExtensionAttribute(aTok); 680 } 681 682 // attribute = alphanum{3,8} ; 683 bool IsUnicodeExtensionAttribute(const Token& aTok) const { 684 return 3 <= aTok.Length() && aTok.Length() <= 8; 685 } 686 687 // key = alphanum alpha ; 688 bool IsUnicodeExtensionKey(const Token& aTok) const { 689 return aTok.Length() == 2 && 690 mozilla::IsAsciiAlpha(CharAt(aTok.Index() + 1)); 691 } 692 693 // type = alphanum{3,8} (sep alphanum{3,8})* ; 694 bool IsUnicodeExtensionType(const Token& aTok) const { 695 return 3 <= aTok.Length() && aTok.Length() <= 8; 696 } 697 698 // tkey = alpha digit ; 699 bool IsTransformExtensionKey(const Token& aTok) const { 700 return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index())) && 701 mozilla::IsAsciiDigit(CharAt(aTok.Index() + 1)); 702 } 703 704 // tvalue = (sep alphanum{3,8})+ ; 705 bool IsTransformExtensionPart(const Token& aTok) const { 706 return 3 <= aTok.Length() && aTok.Length() <= 8; 707 } 708 709 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; 710 bool IsPrivateUseStart(const Token& aTok) const { 711 return aTok.Length() == 1 && SingletonKey(aTok) == 'x'; 712 } 713 714 // pu_extensions = sep [xX] (sep alphanum{1,8})+ ; 715 bool IsPrivateUsePart(const Token& aTok) const { 716 return 1 <= aTok.Length() && aTok.Length() <= 8; 717 } 718 719 // Helper function for use in |ParseBaseName| and 720 // |ParseTlangInTransformExtension|. Do not use this directly! 721 static Result<Ok, ParserError> InternalParseBaseName( 722 LocaleParser& aLocaleParser, Locale& aTag, Token& aTok); 723 724 // Parse the `unicode_language_id` production, i.e. the 725 // language/script/region/variants portion of a locale, into |aTag|. 726 // |aTok| must be the current token. 727 static Result<Ok, ParserError> ParseBaseName(LocaleParser& aLocaleParser, 728 Locale& aTag, Token& aTok) { 729 return InternalParseBaseName(aLocaleParser, aTag, aTok); 730 } 731 732 // Parse the `tlang` production within a parsed 't' transform extension. 733 // The precise requirements for "previously parsed" are: 734 // 735 // * the input begins from current token |tok| with a valid `tlang` 736 // * the `tlang` is wholly lowercase (*not* canonical case) 737 // * variant subtags in the `tlang` may contain duplicates and be 738 // unordered 739 // 740 // Return an error on internal failure. Otherwise, return a success value. If 741 // there was no `tlang`, then |tag.language().missing()|. But if there was a 742 // `tlang`, then |tag| is filled with subtags exactly as they appeared in the 743 // parse input. 744 static Result<Ok, ParserError> ParseTlangInTransformExtension( 745 LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) { 746 MOZ_ASSERT(aLocaleParser.IsLanguage(aTok)); 747 return InternalParseBaseName(aLocaleParser, aTag, aTok); 748 } 749 750 friend class Locale; 751 752 class Range final { 753 size_t mBegin; 754 size_t mLength; 755 756 public: 757 Range(size_t aBegin, size_t aLength) : mBegin(aBegin), mLength(aLength) {} 758 759 size_t Begin() const { return mBegin; } 760 size_t Length() const { return mLength; } 761 }; 762 763 using TFieldVector = Vector<Range, 8>; 764 using AttributesVector = Vector<Range, 8>; 765 using KeywordsVector = Vector<Range, 8>; 766 767 // Parse |extension|, which must be a validated, fully lowercase 768 // `transformed_extensions` subtag, and fill |tag| and |fields| from the 769 // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent 770 // with |extension|. 771 static Result<Ok, ParserError> ParseTransformExtension( 772 mozilla::Span<const char> aExtension, Locale& aTag, 773 TFieldVector& aFields); 774 775 // Parse |extension|, which must be a validated, fully lowercase 776 // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords| 777 // from the `attribute` and `keyword` components. 778 static Result<Ok, ParserError> ParseUnicodeExtension( 779 mozilla::Span<const char> aExtension, AttributesVector& aAttributes, 780 KeywordsVector& aKeywords); 781 782 public: 783 // Parse the input string as a locale. 784 // 785 // NOTE: |aTag| must be a new, empty Locale. 786 static Result<Ok, ParserError> TryParse(Span<const char> aLocale, 787 Locale& aTag); 788 789 // Parse the input string as the base-name parts (language, script, region, 790 // variants) of a locale. 791 // 792 // NOTE: |aTag| must be a new, empty Locale. 793 static Result<Ok, ParserError> TryParseBaseName(Span<const char> aLocale, 794 Locale& aTag); 795 796 // Return Ok() iff |extension| can be parsed as a Unicode extension subtag. 797 static Result<Ok, ParserError> CanParseUnicodeExtension( 798 Span<const char> aExtension); 799 800 // Return Ok() iff |unicodeType| can be parsed as a Unicode extension type. 801 static Result<Ok, ParserError> CanParseUnicodeExtensionType( 802 Span<const char> aUnicodeType); 803 }; 804 805 MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind) 806 807 } // namespace mozilla::intl 808 809 #endif /* intl_components_Locale_h */