[ tor-browser ].git.dasho

Locale.h (25513B)
      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 /*
      6 * Structured representation of Unicode locale IDs used with Intl functions.
      7 *
      8 * Spec:
      9 * https://unicode.org/reports/tr35/tr35.html#Unicode_Language_and_Locale_Identifiers
     10 */
     11 
     12 #ifndef intl_components_Locale_h
     13 #define intl_components_Locale_h
     14 
     15 #include "mozilla/Assertions.h"
     16 #include "mozilla/intl/ICUError.h"
     17 #include "mozilla/intl/ICU4CGlue.h"
     18 #include "mozilla/Maybe.h"
     19 #include "mozilla/Span.h"
     20 #include "mozilla/TextUtils.h"
     21 #include "mozilla/Try.h"
     22 #include "mozilla/TypedEnumBits.h"
     23 #include "mozilla/Vector.h"
     24 
     25 #include <algorithm>
     26 #include <stddef.h>
     27 #include <stdint.h>
     28 #include <string.h>
     29 #include <utility>
     30 
     31 #include "unicode/uloc.h"
     32 
     33 namespace mozilla::intl {
     34 
/**
 * Return true if |aLanguage| is a valid language subtag.
 *
 * Only declared in this header; the definition is not visible here.
 */
template <typename CharT>
bool IsStructurallyValidLanguageTag(mozilla::Span<const CharT> aLanguage);

/**
 * Return true if |aScript| is a valid script subtag.
 *
 * Only declared in this header; the definition is not visible here.
 */
template <typename CharT>
bool IsStructurallyValidScriptTag(mozilla::Span<const CharT> aScript);

/**
 * Return true if |aRegion| is a valid region subtag.
 *
 * Only declared in this header; the definition is not visible here.
 */
template <typename CharT>
bool IsStructurallyValidRegionTag(mozilla::Span<const CharT> aRegion);

/**
 * Return true if |aVariant| is a valid variant subtag.
 *
 * Only declared in this header; the definition is not visible here.
 */
template <typename CharT>
bool IsStructurallyValidVariantTag(mozilla::Span<const CharT> aVariant);

// The following two validators are only declared when DEBUG is defined.
#ifdef DEBUG
/**
 * Return true if |aExtension| is a valid Unicode extension subtag.
 */
bool IsStructurallyValidUnicodeExtensionTag(
    mozilla::Span<const char> aExtension);

/**
 * Return true if |aPrivateUse| is a valid private-use subtag.
 */
bool IsStructurallyValidPrivateUseTag(mozilla::Span<const char> aPrivateUse);

#endif
     72 
     73 template <typename CharT>
     74 char AsciiToLowerCase(CharT aChar) {
     75  MOZ_ASSERT(mozilla::IsAscii(aChar));
     76  return mozilla::IsAsciiUppercaseAlpha(aChar) ? (aChar + 0x20) : aChar;
     77 }
     78 
     79 template <typename CharT>
     80 char AsciiToUpperCase(CharT aChar) {
     81  MOZ_ASSERT(mozilla::IsAscii(aChar));
     82  return mozilla::IsAsciiLowercaseAlpha(aChar) ? (aChar - 0x20) : aChar;
     83 }
     84 
     85 template <typename CharT>
     86 void AsciiToLowerCase(CharT* aChars, size_t aLength, char* aDest) {
     87  char (&fn)(CharT) = AsciiToLowerCase;
     88  std::transform(aChars, aChars + aLength, aDest, fn);
     89 }
     90 
     91 template <typename CharT>
     92 void AsciiToUpperCase(CharT* aChars, size_t aLength, char* aDest) {
     93  char (&fn)(CharT) = AsciiToUpperCase;
     94  std::transform(aChars, aChars + aLength, aDest, fn);
     95 }
     96 
     97 template <typename CharT>
     98 void AsciiToTitleCase(CharT* aChars, size_t aLength, char* aDest) {
     99  if (aLength > 0) {
    100    AsciiToUpperCase(aChars, 1, aDest);
    101    AsciiToLowerCase(aChars + 1, aLength - 1, aDest + 1);
    102  }
    103 }
    104 
// Constants for language subtag lengths. Each value is the largest length
// permitted by the grammar production quoted above it.
namespace LanguageTagLimits {

// unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
static constexpr size_t LanguageLength = 8;

// unicode_script_subtag = alpha{4} ;
static constexpr size_t ScriptLength = 4;

// unicode_region_subtag = (alpha{2} | digit{3}) ;
//
// |RegionLength| equals the longer of the two forms; the alphabetic and
// numeric lengths are also available separately.
static constexpr size_t RegionLength = 3;
static constexpr size_t AlphaRegionLength = 2;
static constexpr size_t DigitRegionLength = 3;

// unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
static constexpr size_t VariantLength = 8;

// key = alphanum alpha ;
static constexpr size_t UnicodeKeyLength = 2;

// tkey = alpha digit ;
static constexpr size_t TransformKeyLength = 2;

}  // namespace LanguageTagLimits
    129 
    130 // Fixed size language subtag which is stored inline in Locale.
    131 template <size_t SubtagLength>
    132 class LanguageTagSubtag final {
    133  uint8_t mLength = 0;
    134  char mChars[SubtagLength] = {};  // zero initialize
    135 
    136 public:
    137  LanguageTagSubtag() = default;
    138 
    139  LanguageTagSubtag(const LanguageTagSubtag& aOther) {
    140    std::copy_n(aOther.mChars, SubtagLength, mChars);
    141    mLength = aOther.mLength;
    142  }
    143 
    144  template <typename CharT>
    145  explicit LanguageTagSubtag(mozilla::Span<const CharT> str) {
    146    Set(str);
    147  }
    148 
    149  LanguageTagSubtag& operator=(const LanguageTagSubtag& aOther) {
    150    std::copy_n(aOther.mChars, SubtagLength, mChars);
    151    mLength = aOther.mLength;
    152    return *this;
    153  }
    154 
    155  size_t Length() const { return mLength; }
    156  bool Missing() const { return mLength == 0; }
    157  bool Present() const { return mLength > 0; }
    158 
    159  mozilla::Span<const char> Span() const { return {mChars, mLength}; }
    160 
    161  template <typename CharT>
    162  void Set(mozilla::Span<const CharT> str) {
    163    MOZ_ASSERT(str.size() <= SubtagLength);
    164    std::copy_n(str.data(), str.size(), mChars);
    165    mLength = str.size();
    166  }
    167 
    168  // The toXYZCase() methods are using |SubtagLength| instead of |length()|,
    169  // because current compilers (tested GCC and Clang) can't infer the maximum
    170  // string length - even when using hints like |std::min| - and instead are
    171  // emitting SIMD optimized code. Using a fixed sized length avoids emitting
    172  // the SIMD code. (Emitting SIMD code doesn't make sense here, because the
    173  // SIMD code only kicks in for long strings.) A fixed length will
    174  // additionally ensure the compiler unrolls the loop in the case conversion
    175  // code.
    176 
    177  void ToLowerCase() { AsciiToLowerCase(mChars, SubtagLength, mChars); }
    178 
    179  void ToUpperCase() { AsciiToUpperCase(mChars, SubtagLength, mChars); }
    180 
    181  void ToTitleCase() { AsciiToTitleCase(mChars, SubtagLength, mChars); }
    182 
    183  template <size_t N>
    184  bool EqualTo(const char (&str)[N]) const {
    185    static_assert(N - 1 <= SubtagLength,
    186                  "subtag literals must not exceed the maximum subtag length");
    187 
    188    return mLength == N - 1 && memcmp(mChars, str, N - 1) == 0;
    189  }
    190 };
    191 
// Inline subtag types, each sized with the matching maximum length from
// |LanguageTagLimits|.
using LanguageSubtag = LanguageTagSubtag<LanguageTagLimits::LanguageLength>;
using ScriptSubtag = LanguageTagSubtag<LanguageTagLimits::ScriptLength>;
using RegionSubtag = LanguageTagSubtag<LanguageTagLimits::RegionLength>;
using VariantSubtag = LanguageTagSubtag<LanguageTagLimits::VariantLength>;

// Character type with the width of |unsigned char|.
using Latin1Char = unsigned char;
// Owning pointer to a |char| array.
using UniqueChars = UniquePtr<char[]>;
    199 
    200 /**
    201 * Object representing a Unicode BCP 47 locale identifier.
    202 *
    203 * All subtags are already in canonicalized case.
    204 */
    205 class MOZ_STACK_CLASS Locale final {
    206 public:
    207  using VariantsVector = Vector<VariantSubtag, 2>;
    208  using ExtensionsVector = Vector<UniqueChars, 2>;
    209 
    210 private:
    211  LanguageSubtag mLanguage = {};
    212  ScriptSubtag mScript = {};
    213  RegionSubtag mRegion = {};
    214 
    215  VariantsVector mVariants;
    216  ExtensionsVector mExtensions;
    217  UniqueChars mPrivateUse = nullptr;
    218 
    219  friend class LocaleParser;
    220 
    221 public:
    222  enum class CanonicalizationError : uint8_t {
    223    DuplicateVariant,
    224    InternalError,
    225    OutOfMemory,
    226  };
    227 
    228 private:
    229  Result<Ok, CanonicalizationError> CanonicalizeUnicodeExtension(
    230      UniqueChars& unicodeExtension);
    231 
    232  Result<Ok, CanonicalizationError> CanonicalizeTransformExtension(
    233      UniqueChars& transformExtension);
    234 
    235 public:
    236  static bool LanguageMapping(LanguageSubtag& aLanguage);
    237  static bool ComplexLanguageMapping(const LanguageSubtag& aLanguage);
    238 
    239 private:
    240  static bool ScriptMapping(ScriptSubtag& aScript);
    241  static bool RegionMapping(RegionSubtag& aRegion);
    242  static bool ComplexRegionMapping(const RegionSubtag& aRegion);
    243 
    244  void PerformComplexLanguageMappings();
    245  void PerformComplexRegionMappings();
    246  [[nodiscard]] bool PerformVariantMappings();
    247 
    248  [[nodiscard]] bool UpdateLegacyMappings();
    249 
    250  static bool SignLanguageMapping(LanguageSubtag& aLanguage,
    251                                  const RegionSubtag& aRegion);
    252 
    253  static const char* ReplaceTransformExtensionType(
    254      mozilla::Span<const char> aKey, mozilla::Span<const char> aType);
    255 
    256  static mozilla::Span<const char> ToSpan(const UniqueChars& aChars) {
    257    return MakeStringSpan(aChars.get());
    258  }
    259 
    260  template <size_t N>
    261  static mozilla::Span<const char> ToSpan(const LanguageTagSubtag<N>& aSubtag) {
    262    return aSubtag.Span();
    263  }
    264 
    265 public:
    266  /**
    267   * Given a Unicode key and type, return the null-terminated preferred
    268   * replacement for that type if there is one, or null if there is none, e.g.
    269   * in effect
    270   * |ReplaceUnicodeExtensionType("ca", "islamicc") == "islamic-civil"|
    271   * and
    272   * |ReplaceUnicodeExtensionType("ca", "islamic-civil") == nullptr|.
    273   */
    274  static const char* ReplaceUnicodeExtensionType(
    275      mozilla::Span<const char> aKey, mozilla::Span<const char> aType);
    276 
    277 public:
    278  Locale() = default;
    279  Locale(const Locale&) = delete;
    280  Locale& operator=(const Locale&) = delete;
    281  Locale(Locale&&) = default;
    282  Locale& operator=(Locale&&) = default;
    283 
    284  template <class Vec>
    285  class SubtagIterator {
    286    using Iter = decltype(std::declval<const Vec>().begin());
    287 
    288    Iter mIter;
    289 
    290   public:
    291    explicit SubtagIterator(Iter iter) : mIter(iter) {}
    292 
    293    // std::iterator traits.
    294    using iterator_category = std::input_iterator_tag;
    295    using value_type = Span<const char>;
    296    using difference_type = ptrdiff_t;
    297    using pointer = value_type*;
    298    using reference = value_type&;
    299 
    300    SubtagIterator& operator++() {
    301      mIter++;
    302      return *this;
    303    }
    304 
    305    SubtagIterator operator++(int) {
    306      SubtagIterator result = *this;
    307      ++(*this);
    308      return result;
    309    }
    310 
    311    bool operator==(const SubtagIterator& aOther) const {
    312      return mIter == aOther.mIter;
    313    }
    314 
    315    bool operator!=(const SubtagIterator& aOther) const {
    316      return !(*this == aOther);
    317    }
    318 
    319    value_type operator*() const { return ToSpan(*mIter); }
    320  };
    321 
    322  template <typename T, size_t N>
    323  class SubtagEnumeration {
    324    using Vec = Vector<T, N>;
    325 
    326    const Vec& mVector;
    327 
    328   public:
    329    explicit SubtagEnumeration(const Vec& aVector) : mVector(aVector) {}
    330 
    331    size_t length() const { return mVector.length(); }
    332    bool empty() const { return mVector.empty(); }
    333 
    334    auto begin() const { return SubtagIterator<Vec>(mVector.begin()); }
    335    auto end() const { return SubtagIterator<Vec>(mVector.end()); }
    336 
    337    Span<const char> operator[](size_t aIndex) const {
    338      return ToSpan(mVector[aIndex]);
    339    }
    340  };
    341 
    342  const LanguageSubtag& Language() const { return mLanguage; }
    343  const ScriptSubtag& Script() const { return mScript; }
    344  const RegionSubtag& Region() const { return mRegion; }
    345  auto Variants() const { return SubtagEnumeration(mVariants); }
    346  auto Extensions() const { return SubtagEnumeration(mExtensions); }
    347  Maybe<Span<const char>> PrivateUse() const {
    348    if (const char* p = mPrivateUse.get()) {
    349      return Some(MakeStringSpan(p));
    350    }
    351    return Nothing();
    352  }
    353 
    354  /**
    355   * Return the Unicode extension subtag or Nothing if not present.
    356   */
    357  Maybe<Span<const char>> GetUnicodeExtension() const;
    358 
    359 private:
    360  ptrdiff_t UnicodeExtensionIndex() const;
    361 
    362 public:
    363  /**
    364   * Set the language subtag. The input must be a valid language subtag.
    365   */
    366  template <size_t N>
    367  void SetLanguage(const char (&aLanguage)[N]) {
    368    mozilla::Span<const char> span(aLanguage, N - 1);
    369    MOZ_ASSERT(IsStructurallyValidLanguageTag(span));
    370    mLanguage.Set(span);
    371  }
    372 
    373  /**
    374   * Set the language subtag. The input must be a valid language subtag.
    375   */
    376  void SetLanguage(const LanguageSubtag& aLanguage) {
    377    MOZ_ASSERT(IsStructurallyValidLanguageTag(aLanguage.Span()));
    378    mLanguage.Set(aLanguage.Span());
    379  }
    380 
    381  /**
    382   * Set the script subtag. The input must be a valid script subtag.
    383   */
    384  template <size_t N>
    385  void SetScript(const char (&aScript)[N]) {
    386    mozilla::Span<const char> span(aScript, N - 1);
    387    MOZ_ASSERT(IsStructurallyValidScriptTag(span));
    388    mScript.Set(span);
    389  }
    390 
    391  /**
    392   * Set the script subtag. The input must be a valid script subtag or the empty
    393   * string.
    394   */
    395  void SetScript(const ScriptSubtag& aScript) {
    396    MOZ_ASSERT(aScript.Missing() ||
    397               IsStructurallyValidScriptTag(aScript.Span()));
    398    mScript.Set(aScript.Span());
    399  }
    400 
    401  /**
    402   * Set the region subtag. The input must be a valid region subtag.
    403   */
    404  template <size_t N>
    405  void SetRegion(const char (&aRegion)[N]) {
    406    mozilla::Span<const char> span(aRegion, N - 1);
    407    MOZ_ASSERT(IsStructurallyValidRegionTag(span));
    408    mRegion.Set(span);
    409  }
    410 
    411  /**
    412   * Set the region subtag. The input must be a valid region subtag or the empty
    413   * empty string.
    414   */
    415  void SetRegion(const RegionSubtag& aRegion) {
    416    MOZ_ASSERT(aRegion.Missing() ||
    417               IsStructurallyValidRegionTag(aRegion.Span()));
    418    mRegion.Set(aRegion.Span());
    419  }
    420 
    421  /**
    422   * Set the variant subtags. Each element must be a valid variant subtag.
    423   */
    424  void SetVariants(VariantsVector&& aVariants) {
    425    MOZ_ASSERT(std::all_of(
    426        aVariants.begin(), aVariants.end(), [](const auto& variant) {
    427          return IsStructurallyValidVariantTag(variant.Span());
    428        }));
    429    mVariants = std::move(aVariants);
    430  }
    431 
    432  /**
    433   * Remove all variant subtags.
    434   */
    435  void ClearVariants() { mVariants.clearAndFree(); }
    436 
    437  /**
    438   * Set the Unicode extension subtag. The input must be a valid Unicode
    439   * extension subtag.
    440   */
    441  ICUResult SetUnicodeExtension(Span<const char> aExtension);
    442 
    443  /**
    444   * Remove any Unicode extension subtag if present.
    445   */
    446  void ClearUnicodeExtension();
    447 
    448  /** Canonicalize the base-name (language, script, region, variant) subtags. */
    449  Result<Ok, CanonicalizationError> CanonicalizeBaseName();
    450 
    451  /**
    452   * Canonicalize all extension subtags.
    453   */
    454  Result<Ok, CanonicalizationError> CanonicalizeExtensions();
    455 
    456  /**
    457   * Canonicalizes the given structurally valid Unicode BCP 47 locale
    458   * identifier, including regularized case of subtags. For example, the
    459   * locale Zh-haNS-bu-variant2-Variant1-u-ca-chinese-t-Zh-laTN-x-PRIVATE,
    460   * where
    461   *
    462   *     Zh             ; 2*3ALPHA
    463   *     -haNS          ; ["-" script]
    464   *     -bu            ; ["-" region]
    465   *     -variant2      ; *("-" variant)
    466   *     -Variant1
    467   *     -u-ca-chinese  ; *("-" extension)
    468   *     -t-Zh-laTN
    469   *     -x-PRIVATE     ; ["-" privateuse]
    470   *
    471   * becomes zh-Hans-MM-variant1-variant2-t-zh-latn-u-ca-chinese-x-private
    472   *
    473   * Spec: ECMAScript Internationalization API Specification, 6.2.3.
    474   */
    475  Result<Ok, CanonicalizationError> Canonicalize() {
    476    MOZ_TRY(CanonicalizeBaseName());
    477    return CanonicalizeExtensions();
    478  }
    479 
    480  /**
    481   * Fill the buffer with a string representation of the locale.
    482   */
    483  template <typename B>
    484  ICUResult ToString(B& aBuffer) const {
    485    static_assert(std::is_same_v<typename B::CharType, char>);
    486 
    487    size_t capacity = ToStringCapacity();
    488 
    489    // Attempt to reserve needed capacity
    490    if (!aBuffer.reserve(capacity)) {
    491      return Err(ICUError::OutOfMemory);
    492    }
    493 
    494    size_t offset = ToStringAppend(aBuffer.data());
    495 
    496    MOZ_ASSERT(capacity == offset);
    497    aBuffer.written(offset);
    498 
    499    return Ok();
    500  }
    501 
    502  /**
    503   * Add likely-subtags to the locale.
    504   *
    505   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
    506   */
    507  ICUResult AddLikelySubtags();
    508 
    509  /**
    510   * Remove likely-subtags from the locale.
    511   *
    512   * Spec: <https://www.unicode.org/reports/tr35/#Likely_Subtags>
    513   */
    514  ICUResult RemoveLikelySubtags();
    515 
    516  /**
    517   * Returns the default locale as an ICU locale identifier. The returned string
    518   * is NOT a valid BCP 47 locale!
    519   *
    520   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
    521   */
    522  static const char* GetDefaultLocale() { return uloc_getDefault(); }
    523 
    524  /**
    525   * Returns an iterator over all supported locales.
    526   *
    527   * The returned strings are ICU locale identifiers and NOT BCP 47 language
    528   * tags.
    529   *
    530   * Also see <https://unicode-org.github.io/icu/userguide/locale>.
    531   */
    532  static auto GetAvailableLocales() {
    533    return AvailableLocalesEnumeration<uloc_countAvailable,
    534                                       uloc_getAvailable>();
    535  }
    536 
    537 private:
    538  static UniqueChars DuplicateStringToUniqueChars(const char* aStr);
    539  static UniqueChars DuplicateStringToUniqueChars(Span<const char> aStr);
    540  size_t ToStringCapacity() const;
    541  size_t ToStringAppend(char* aBuffer) const;
    542 };
    543 
    544 /**
    545 * Parser for Unicode BCP 47 locale identifiers.
    546 *
    547 * <https://unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers>
    548 */
    549 class MOZ_STACK_CLASS LocaleParser final {
    550 public:
    551  enum class ParserError : uint8_t {
    552    // Input was not parseable as a locale, subtag or extension.
    553    NotParseable,
    554    // Unable to allocate memory for the parser to operate.
    555    OutOfMemory,
    556  };
    557 
    558  // Exposed as |public| for |MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS|.
    559  enum class TokenKind : uint8_t {
    560    None = 0b000,
    561    Alpha = 0b001,
    562    Digit = 0b010,
    563    AlphaDigit = 0b011,
    564    Error = 0b100
    565  };
    566 
    567 private:
    568  class Token final {
    569    size_t mIndex;
    570    size_t mLength;
    571    TokenKind mKind;
    572 
    573   public:
    574    Token(TokenKind aKind, size_t aIndex, size_t aLength)
    575        : mIndex(aIndex), mLength(aLength), mKind(aKind) {}
    576 
    577    TokenKind Kind() const { return mKind; }
    578    size_t Index() const { return mIndex; }
    579    size_t Length() const { return mLength; }
    580 
    581    bool IsError() const { return mKind == TokenKind::Error; }
    582    bool IsNone() const { return mKind == TokenKind::None; }
    583    bool IsAlpha() const { return mKind == TokenKind::Alpha; }
    584    bool IsDigit() const { return mKind == TokenKind::Digit; }
    585    bool IsAlphaDigit() const { return mKind == TokenKind::AlphaDigit; }
    586  };
    587 
    588  const char* mLocale;
    589  size_t mLength;
    590  size_t mIndex = 0;
    591 
    592  explicit LocaleParser(Span<const char> aLocale)
    593      : mLocale(aLocale.data()), mLength(aLocale.size()) {}
    594 
    595  char CharAt(size_t aIndex) const { return mLocale[aIndex]; }
    596 
    597  // Copy the token characters into |subtag|.
    598  template <size_t N>
    599  void CopyChars(const Token& aTok, LanguageTagSubtag<N>& aSubtag) const {
    600    aSubtag.Set(mozilla::Span(mLocale + aTok.Index(), aTok.Length()));
    601  }
    602 
    603  // Create a string copy of |length| characters starting at |index|.
    604  UniqueChars Chars(size_t aIndex, size_t aLength) const;
    605 
    606  // Create a string copy of the token characters.
    607  UniqueChars Chars(const Token& aTok) const {
    608    return Chars(aTok.Index(), aTok.Length());
    609  }
    610 
    611  UniqueChars Extension(const Token& aStart, const Token& aEnd) const {
    612    MOZ_ASSERT(aStart.Index() < aEnd.Index());
    613 
    614    size_t length = aEnd.Index() - 1 - aStart.Index();
    615    return Chars(aStart.Index(), length);
    616  }
    617 
    618  Token NextToken();
    619 
    620  // unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    621  //
    622  // Four character language subtags are not allowed in Unicode BCP 47 locale
    623  // identifiers. Also see the comparison to Unicode CLDR locale identifiers in
    624  // <https://unicode.org/reports/tr35/#BCP_47_Conformance>.
    625  bool IsLanguage(const Token& aTok) const {
    626    return aTok.IsAlpha() && ((2 <= aTok.Length() && aTok.Length() <= 3) ||
    627                              (5 <= aTok.Length() && aTok.Length() <= 8));
    628  }
    629 
    630  // unicode_script_subtag = alpha{4} ;
    631  bool IsScript(const Token& aTok) const {
    632    return aTok.IsAlpha() && aTok.Length() == 4;
    633  }
    634 
    635  // unicode_region_subtag = (alpha{2} | digit{3}) ;
    636  bool IsRegion(const Token& aTok) const {
    637    return (aTok.IsAlpha() && aTok.Length() == 2) ||
    638           (aTok.IsDigit() && aTok.Length() == 3);
    639  }
    640 
    641  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
    642  bool IsVariant(const Token& aTok) const {
    643    return (5 <= aTok.Length() && aTok.Length() <= 8) ||
    644           (aTok.Length() == 4 && mozilla::IsAsciiDigit(CharAt(aTok.Index())));
    645  }
    646 
    647  // Returns the code unit of the first character at the given singleton token.
    648  // Always returns the lower case form of an alphabetical character.
    649  char SingletonKey(const Token& aTok) const {
    650    MOZ_ASSERT(aTok.Length() == 1);
    651    return AsciiToLowerCase(CharAt(aTok.Index()));
    652  }
    653 
    654  // extensions = unicode_locale_extensions |
    655  //              transformed_extensions |
    656  //              other_extensions ;
    657  //
    658  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
    659  //                                       (sep attribute)+ (sep keyword)*) ;
    660  //
    661  // transformed_extensions = sep [tT] ((sep tlang (sep tfield)*) |
    662  //                                    (sep tfield)+) ;
    663  //
    664  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
    665  bool IsExtensionStart(const Token& aTok) const {
    666    return aTok.Length() == 1 && SingletonKey(aTok) != 'x';
    667  }
    668 
    669  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
    670  bool IsOtherExtensionPart(const Token& aTok) const {
    671    return 2 <= aTok.Length() && aTok.Length() <= 8;
    672  }
    673 
    674  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
    675  //                                       (sep attribute)+ (sep keyword)*) ;
    676  // keyword = key (sep type)? ;
    677  bool IsUnicodeExtensionPart(const Token& aTok) const {
    678    return IsUnicodeExtensionKey(aTok) || IsUnicodeExtensionType(aTok) ||
    679           IsUnicodeExtensionAttribute(aTok);
    680  }
    681 
    682  // attribute = alphanum{3,8} ;
    683  bool IsUnicodeExtensionAttribute(const Token& aTok) const {
    684    return 3 <= aTok.Length() && aTok.Length() <= 8;
    685  }
    686 
    687  // key = alphanum alpha ;
    688  bool IsUnicodeExtensionKey(const Token& aTok) const {
    689    return aTok.Length() == 2 &&
    690           mozilla::IsAsciiAlpha(CharAt(aTok.Index() + 1));
    691  }
    692 
    693  // type = alphanum{3,8} (sep alphanum{3,8})* ;
    694  bool IsUnicodeExtensionType(const Token& aTok) const {
    695    return 3 <= aTok.Length() && aTok.Length() <= 8;
    696  }
    697 
    698  // tkey = alpha digit ;
    699  bool IsTransformExtensionKey(const Token& aTok) const {
    700    return aTok.Length() == 2 && mozilla::IsAsciiAlpha(CharAt(aTok.Index())) &&
    701           mozilla::IsAsciiDigit(CharAt(aTok.Index() + 1));
    702  }
    703 
    704  // tvalue = (sep alphanum{3,8})+ ;
    705  bool IsTransformExtensionPart(const Token& aTok) const {
    706    return 3 <= aTok.Length() && aTok.Length() <= 8;
    707  }
    708 
    709  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
    710  bool IsPrivateUseStart(const Token& aTok) const {
    711    return aTok.Length() == 1 && SingletonKey(aTok) == 'x';
    712  }
    713 
    714  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
    715  bool IsPrivateUsePart(const Token& aTok) const {
    716    return 1 <= aTok.Length() && aTok.Length() <= 8;
    717  }
    718 
    719  // Helper function for use in |ParseBaseName| and
    720  // |ParseTlangInTransformExtension|.  Do not use this directly!
    721  static Result<Ok, ParserError> InternalParseBaseName(
    722      LocaleParser& aLocaleParser, Locale& aTag, Token& aTok);
    723 
    724  // Parse the `unicode_language_id` production, i.e. the
    725  // language/script/region/variants portion of a locale, into |aTag|.
    726  // |aTok| must be the current token.
    727  static Result<Ok, ParserError> ParseBaseName(LocaleParser& aLocaleParser,
    728                                               Locale& aTag, Token& aTok) {
    729    return InternalParseBaseName(aLocaleParser, aTag, aTok);
    730  }
    731 
    732  // Parse the `tlang` production within a parsed 't' transform extension.
    733  // The precise requirements for "previously parsed" are:
    734  //
    735  //   * the input begins from current token |tok| with a valid `tlang`
    736  //   * the `tlang` is wholly lowercase (*not* canonical case)
    737  //   * variant subtags in the `tlang` may contain duplicates and be
    738  //     unordered
    739  //
    740  // Return an error on internal failure. Otherwise, return a success value. If
    741  // there was no `tlang`, then |tag.language().missing()|. But if there was a
    742  // `tlang`, then |tag| is filled with subtags exactly as they appeared in the
    743  // parse input.
    744  static Result<Ok, ParserError> ParseTlangInTransformExtension(
    745      LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) {
    746    MOZ_ASSERT(aLocaleParser.IsLanguage(aTok));
    747    return InternalParseBaseName(aLocaleParser, aTag, aTok);
    748  }
    749 
    750  friend class Locale;
    751 
    752  class Range final {
    753    size_t mBegin;
    754    size_t mLength;
    755 
    756   public:
    757    Range(size_t aBegin, size_t aLength) : mBegin(aBegin), mLength(aLength) {}
    758 
    759    size_t Begin() const { return mBegin; }
    760    size_t Length() const { return mLength; }
    761  };
    762 
    763  using TFieldVector = Vector<Range, 8>;
    764  using AttributesVector = Vector<Range, 8>;
    765  using KeywordsVector = Vector<Range, 8>;
    766 
    767  // Parse |extension|, which must be a validated, fully lowercase
    768  // `transformed_extensions` subtag, and fill |tag| and |fields| from the
    769  // `tlang` and `tfield` components. Data in |tag| is lowercase, consistent
    770  // with |extension|.
    771  static Result<Ok, ParserError> ParseTransformExtension(
    772      mozilla::Span<const char> aExtension, Locale& aTag,
    773      TFieldVector& aFields);
    774 
    775  // Parse |extension|, which must be a validated, fully lowercase
    776  // `unicode_locale_extensions` subtag, and fill |attributes| and |keywords|
    777  // from the `attribute` and `keyword` components.
    778  static Result<Ok, ParserError> ParseUnicodeExtension(
    779      mozilla::Span<const char> aExtension, AttributesVector& aAttributes,
    780      KeywordsVector& aKeywords);
    781 
    782 public:
    783  // Parse the input string as a locale.
    784  //
    785  // NOTE: |aTag| must be a new, empty Locale.
    786  static Result<Ok, ParserError> TryParse(Span<const char> aLocale,
    787                                          Locale& aTag);
    788 
    789  // Parse the input string as the base-name parts (language, script, region,
    790  // variants) of a locale.
    791  //
    792  // NOTE: |aTag| must be a new, empty Locale.
    793  static Result<Ok, ParserError> TryParseBaseName(Span<const char> aLocale,
    794                                                  Locale& aTag);
    795 
    796  // Return Ok() iff |extension| can be parsed as a Unicode extension subtag.
    797  static Result<Ok, ParserError> CanParseUnicodeExtension(
    798      Span<const char> aExtension);
    799 
    800  // Return Ok() iff |unicodeType| can be parsed as a Unicode extension type.
    801  static Result<Ok, ParserError> CanParseUnicodeExtensionType(
    802      Span<const char> aUnicodeType);
    803 };
    804 
// Presumably provides the bitwise operators for |LocaleParser::TokenKind|, as
// the macro name suggests; this is why the enum is declared |public|.
MOZ_MAKE_ENUM_CLASS_BITWISE_OPERATORS(LocaleParser::TokenKind)
    806 
    807 }  // namespace mozilla::intl
    808 
    809 #endif /* intl_components_Locale_h */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE