[ tor-browser ].git.dasho

Locale.cpp (46459B)
      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 
      5 #include "mozilla/intl/Locale.h"
      6 
      7 #include "mozilla/Assertions.h"
      8 #include "mozilla/Span.h"
      9 #include "mozilla/TextUtils.h"
     10 #include "mozilla/Variant.h"
     11 
     12 #include "ICU4CGlue.h"
     13 
     14 #include <algorithm>
     15 #include <iterator>
     16 #include <stddef.h>
     17 #include <stdint.h>
     18 #include <string.h>
     19 #include <utility>
     20 
     21 #include "unicode/uloc.h"
     22 #include "unicode/utypes.h"
     23 
     24 namespace mozilla::intl {
     25 
     26 using namespace intl::LanguageTagLimits;
     27 
     28 template <typename CharT>
     29 bool IsStructurallyValidLanguageTag(Span<const CharT> aLanguage) {
     30  // unicode_language_subtag = alpha{2,3} | alpha{5,8};
     31  size_t length = aLanguage.size();
     32  const CharT* str = aLanguage.data();
     33  return ((2 <= length && length <= 3) || (5 <= length && length <= 8)) &&
     34         std::all_of(str, str + length, IsAsciiAlpha<CharT>);
     35 }
     36 
     37 template bool IsStructurallyValidLanguageTag(Span<const char> aLanguage);
     38 template bool IsStructurallyValidLanguageTag(Span<const Latin1Char> aLanguage);
     39 template bool IsStructurallyValidLanguageTag(Span<const char16_t> aLanguage);
     40 
     41 template <typename CharT>
     42 bool IsStructurallyValidScriptTag(Span<const CharT> aScript) {
     43  // unicode_script_subtag = alpha{4} ;
     44  size_t length = aScript.size();
     45  const CharT* str = aScript.data();
     46  return length == 4 && std::all_of(str, str + length, IsAsciiAlpha<CharT>);
     47 }
     48 
     49 template bool IsStructurallyValidScriptTag(Span<const char> aScript);
     50 template bool IsStructurallyValidScriptTag(Span<const Latin1Char> aScript);
     51 template bool IsStructurallyValidScriptTag(Span<const char16_t> aScript);
     52 
     53 template <typename CharT>
     54 bool IsStructurallyValidRegionTag(Span<const CharT> aRegion) {
     55  // unicode_region_subtag = (alpha{2} | digit{3}) ;
     56  size_t length = aRegion.size();
     57  const CharT* str = aRegion.data();
     58  return (length == 2 && std::all_of(str, str + length, IsAsciiAlpha<CharT>)) ||
     59         (length == 3 && std::all_of(str, str + length, IsAsciiDigit<CharT>));
     60 }
     61 
     62 template bool IsStructurallyValidRegionTag(Span<const char> aRegion);
     63 template bool IsStructurallyValidRegionTag(Span<const Latin1Char> aRegion);
     64 template bool IsStructurallyValidRegionTag(Span<const char16_t> aRegion);
     65 
     66 template <typename CharT>
     67 bool IsStructurallyValidVariantTag(Span<const CharT> aVariant) {
     68  // unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) ;
     69  size_t length = aVariant.size();
     70  const CharT* str = aVariant.data();
     71  return ((5 <= length && length <= 8) ||
     72          (length == 4 && IsAsciiDigit(str[0]))) &&
     73         std::all_of(str, str + length, IsAsciiAlphanumeric<CharT>);
     74 }
     75 
     76 template bool IsStructurallyValidVariantTag(Span<const char> aVariant);
     77 template bool IsStructurallyValidVariantTag(Span<const Latin1Char> aVariant);
     78 template bool IsStructurallyValidVariantTag(Span<const char16_t> aVariant);
     79 
     80 #ifdef DEBUG
     81 bool IsStructurallyValidUnicodeExtensionTag(Span<const char> aExtension) {
     82  return LocaleParser::CanParseUnicodeExtension(aExtension).isOk();
     83 }
     84 
     85 static bool IsStructurallyValidExtensionTag(Span<const char> aExtension) {
     86  // other_extensions = sep [alphanum-[tTuUxX]] (sep alphanum{2,8})+ ;
     87  // NB: Allow any extension, including Unicode and Transform here, because
     88  // this function is only used for an assertion.
     89 
     90  size_t length = aExtension.size();
     91  const char* str = aExtension.data();
     92  const char* const end = aExtension.data() + length;
     93  if (length <= 2) {
     94    return false;
     95  }
     96  if (!IsAsciiAlphanumeric(str[0]) || str[0] == 'x' || str[0] == 'X') {
     97    return false;
     98  }
     99  str++;
    100  if (*str++ != '-') {
    101    return false;
    102  }
    103  while (true) {
    104    const char* sep =
    105        reinterpret_cast<const char*>(memchr(str, '-', end - str));
    106    size_t len = (sep ? sep : end) - str;
    107    if (len < 2 || len > 8 ||
    108        !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) {
    109      return false;
    110    }
    111    if (!sep) {
    112      return true;
    113    }
    114    str = sep + 1;
    115  }
    116 }
    117 
    118 bool IsStructurallyValidPrivateUseTag(Span<const char> aPrivateUse) {
    119  // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
    120 
    121  size_t length = aPrivateUse.size();
    122  const char* str = aPrivateUse.data();
    123  const char* const end = aPrivateUse.data() + length;
    124  if (length <= 2) {
    125    return false;
    126  }
    127  if (str[0] != 'x' && str[0] != 'X') {
    128    return false;
    129  }
    130  str++;
    131  if (*str++ != '-') {
    132    return false;
    133  }
    134  while (true) {
    135    const char* sep =
    136        reinterpret_cast<const char*>(memchr(str, '-', end - str));
    137    size_t len = (sep ? sep : end) - str;
    138    if (len == 0 || len > 8 ||
    139        !std::all_of(str, str + len, IsAsciiAlphanumeric<char>)) {
    140      return false;
    141    }
    142    if (!sep) {
    143      return true;
    144    }
    145    str = sep + 1;
    146  }
    147 }
    148 #endif
    149 
    150 ptrdiff_t Locale::UnicodeExtensionIndex() const {
    151  // The extension subtags aren't necessarily sorted, so we can't use binary
    152  // search here.
    153  auto p = std::find_if(
    154      mExtensions.begin(), mExtensions.end(),
    155      [](const auto& ext) { return ext[0] == 'u' || ext[0] == 'U'; });
    156  if (p != mExtensions.end()) {
    157    return std::distance(mExtensions.begin(), p);
    158  }
    159  return -1;
    160 }
    161 
    162 Maybe<Span<const char>> Locale::GetUnicodeExtension() const {
    163  ptrdiff_t index = UnicodeExtensionIndex();
    164  if (index >= 0) {
    165    return Some(MakeStringSpan(mExtensions[index].get()));
    166  }
    167  return Nothing();
    168 }
    169 
    170 ICUResult Locale::SetUnicodeExtension(Span<const char> aExtension) {
    171  MOZ_ASSERT(IsStructurallyValidUnicodeExtensionTag(aExtension));
    172 
    173  auto duplicated = DuplicateStringToUniqueChars(aExtension);
    174 
    175  // Replace the existing Unicode extension subtag or append a new one.
    176  ptrdiff_t index = UnicodeExtensionIndex();
    177  if (index >= 0) {
    178    mExtensions[index] = std::move(duplicated);
    179    return Ok();
    180  }
    181  if (!mExtensions.append(std::move(duplicated))) {
    182    return Err(ICUError::OutOfMemory);
    183  }
    184  return Ok();
    185 }
    186 
    187 void Locale::ClearUnicodeExtension() {
    188  ptrdiff_t index = UnicodeExtensionIndex();
    189  if (index >= 0) {
    190    mExtensions.erase(mExtensions.begin() + index);
    191  }
    192 }
    193 
    194 template <size_t InitialCapacity>
    195 static void SortAlphabetically(
    196    Vector<VariantSubtag, InitialCapacity>& aVariants) {
    197  size_t length = aVariants.length();
    198 
    199  // Zero or one element lists are already sorted.
    200  if (length < 2) {
    201    return;
    202  }
    203 
    204  // Handle two element lists inline.
    205  if (length == 2) {
    206    if (aVariants[0].Span() > aVariants[1].Span()) {
    207      std::swap(aVariants[0], aVariants[1]);
    208    }
    209    return;
    210  }
    211 
    212  std::stable_sort(
    213      aVariants.begin(), aVariants.end(),
    214      [](const auto& a, const auto& b) { return a.Span() < b.Span(); });
    215 }
    216 
    217 template <size_t InitialCapacity>
    218 static bool SortAlphabetically(Vector<UniqueChars, InitialCapacity>& aSubtags) {
    219  size_t length = aSubtags.length();
    220 
    221  // Zero or one element lists are already sorted.
    222  if (length < 2) {
    223    return true;
    224  }
    225 
    226  // Handle two element lists inline.
    227  if (length == 2) {
    228    if (strcmp(aSubtags[0].get(), aSubtags[1].get()) > 0) {
    229      aSubtags[0].swap(aSubtags[1]);
    230    }
    231    return true;
    232  }
    233 
    234  Vector<char*, 8> scratch;
    235  if (!scratch.resizeUninitialized(length)) {
    236    return false;
    237  }
    238  for (size_t i = 0; i < length; i++) {
    239    scratch[i] = aSubtags[i].release();
    240  }
    241 
    242  std::stable_sort(
    243      scratch.begin(), scratch.end(),
    244      [](const char* a, const char* b) { return strcmp(a, b) < 0; });
    245 
    246  for (size_t i = 0; i < length; i++) {
    247    aSubtags[i] = UniqueChars(scratch[i]);
    248  }
    249  return true;
    250 }
    251 
    252 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeBaseName() {
    253  // Per 6.2.3 CanonicalizeUnicodeLocaleId, the very first step is to
    254  // canonicalize the syntax by normalizing the case and ordering all subtags.
    255  // The canonical syntax form is specified in UTS 35, 3.2.1.
    256 
    257  // Language codes need to be in lower case. "JA" -> "ja"
    258  mLanguage.ToLowerCase();
    259  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
    260 
    261  // The first character of a script code needs to be capitalized.
    262  // "hans" -> "Hans"
    263  mScript.ToTitleCase();
    264  MOZ_ASSERT(Script().Missing() ||
    265             IsStructurallyValidScriptTag(Script().Span()));
    266 
    267  // Region codes need to be in upper case. "bu" -> "BU"
    268  mRegion.ToUpperCase();
    269  MOZ_ASSERT(Region().Missing() ||
    270             IsStructurallyValidRegionTag(Region().Span()));
    271 
    272  // The canonical case for variant subtags is lowercase.
    273  for (auto& variant : mVariants) {
    274    variant.ToLowerCase();
    275    MOZ_ASSERT(IsStructurallyValidVariantTag(variant.Span()));
    276  }
    277 
    278  // Extensions and privateuse subtags are case normalized in the
    279  // |canonicalizeExtensions| method.
    280 
    281  // The second step in UTS 35, 3.2.1, is to order all subtags.
    282 
    283  if (mVariants.length() > 1) {
    284    // 1. Any variants are in alphabetical order.
    285    SortAlphabetically(mVariants);
    286 
    287    // Reject the Locale identifier if a duplicate variant was found, e.g.
    288    // "en-variant-Variant".
    289    const auto* duplicate = std::adjacent_find(
    290        mVariants.begin(), mVariants.end(),
    291        [](const auto& a, const auto& b) { return a.Span() == b.Span(); });
    292    if (duplicate != mVariants.end()) {
    293      return Err(CanonicalizationError::DuplicateVariant);
    294    }
    295  }
    296 
    297  // 2. Any extensions are in alphabetical order by their singleton.
    298  // 3. All attributes are sorted in alphabetical order.
    299  // 4. All keywords and tfields are sorted by alphabetical order of their keys,
    300  //    within their respective extensions.
    301  // 5. Any type or tfield value "true" is removed.
    302  // - A subsequent call to canonicalizeExtensions() will perform these steps.
    303 
    304  // 6.2.3 CanonicalizeUnicodeLocaleId, step 2 transforms the locale identifier
    305  // into its canonical form per UTS 3.2.1.
    306 
    307  // 1. Use the bcp47 data to replace keys, types, tfields, and tvalues by their
    308  // canonical forms.
    309  // - A subsequent call to canonicalizeExtensions() will perform this step.
    310 
    311  // 2. Replace aliases in the unicode_language_id and tlang (if any).
    312  // - tlang is handled in canonicalizeExtensions().
    313 
    314  // Replace deprecated language, region, and variant subtags with their
    315  // preferred mappings.
    316 
    317  if (!UpdateLegacyMappings()) {
    318    return Err(CanonicalizationError::OutOfMemory);
    319  }
    320 
    321  // Replace deprecated language subtags with their preferred values.
    322  if (!LanguageMapping(mLanguage) && ComplexLanguageMapping(mLanguage)) {
    323    PerformComplexLanguageMappings();
    324  }
    325 
    326  // Replace deprecated script subtags with their preferred values.
    327  if (Script().Present()) {
    328    ScriptMapping(mScript);
    329  }
    330 
    331  // Replace deprecated region subtags with their preferred values.
    332  if (Region().Present()) {
    333    if (!RegionMapping(mRegion) && ComplexRegionMapping(mRegion)) {
    334      PerformComplexRegionMappings();
    335    }
    336  }
    337 
    338  // Replace deprecated variant subtags with their preferred values.
    339  if (!PerformVariantMappings()) {
    340    return Err(CanonicalizationError::OutOfMemory);
    341  }
    342 
    343  // No extension replacements are currently present.
    344  // Private use sequences are left as is.
    345 
    346  // 3. Replace aliases in special key values.
    347  // - A subsequent call to canonicalizeExtensions() will perform this step.
    348 
    349  return Ok();
    350 }
    351 
    352 #ifdef DEBUG
    353 static bool IsAsciiLowercaseAlphanumericOrDash(Span<const char> aSpan) {
    354  const char* ptr = aSpan.data();
    355  size_t length = aSpan.size();
    356  return std::all_of(ptr, ptr + length, [](auto c) {
    357    return IsAsciiLowercaseAlpha(c) || IsAsciiDigit(c) || c == '-';
    358  });
    359 }
    360 #endif
    361 
    362 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeExtensions() {
    363  // The canonical case for all extension subtags is lowercase.
    364  for (UniqueChars& extension : mExtensions) {
    365    char* extensionChars = extension.get();
    366    size_t extensionLength = strlen(extensionChars);
    367    AsciiToLowerCase(extensionChars, extensionLength, extensionChars);
    368 
    369    MOZ_ASSERT(
    370        IsStructurallyValidExtensionTag({extensionChars, extensionLength}));
    371  }
    372 
    373  // Any extensions are in alphabetical order by their singleton.
    374  // "u-ca-chinese-t-zh-latn" -> "t-zh-latn-u-ca-chinese"
    375  if (!SortAlphabetically(mExtensions)) {
    376    return Err(CanonicalizationError::OutOfMemory);
    377  }
    378 
    379  for (UniqueChars& extension : mExtensions) {
    380    if (extension[0] == 'u') {
    381      MOZ_TRY(CanonicalizeUnicodeExtension(extension));
    382    } else if (extension[0] == 't') {
    383      MOZ_TRY(CanonicalizeTransformExtension(extension));
    384    }
    385 
    386    MOZ_ASSERT(
    387        IsAsciiLowercaseAlphanumericOrDash(MakeStringSpan(extension.get())));
    388  }
    389 
    390  // The canonical case for privateuse subtags is lowercase.
    391  if (char* privateuse = mPrivateUse.get()) {
    392    size_t privateuseLength = strlen(privateuse);
    393    AsciiToLowerCase(privateuse, privateuseLength, privateuse);
    394 
    395    MOZ_ASSERT(
    396        IsStructurallyValidPrivateUseTag({privateuse, privateuseLength}));
    397  }
    398  return Ok();
    399 }
    400 
    401 template <size_t N>
    402 static inline bool AppendSpan(Vector<char, N>& vector, Span<const char> aSpan) {
    403  return vector.append(aSpan.data(), aSpan.size());
    404 }
    405 
    406 /**
    407 * CanonicalizeUnicodeExtension( attributes, keywords )
    408 *
    409 * Canonical syntax per
    410 * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
    411 *
    412 * - All attributes and keywords are in lowercase.
    413 *   - Note: The parser already converted keywords to lowercase.
    414 * - All attributes are sorted in alphabetical order.
    415 * - All keywords are sorted by alphabetical order of their keys.
    416 * - Any type value "true" is removed.
    417 *
    418 * Canonical form:
    419 * - All keys and types use the canonical form (from the name attribute;
    420 *   see Section 3.6.4 U Extension Data Files).
    421 */
    422 Result<Ok, Locale::CanonicalizationError> Locale::CanonicalizeUnicodeExtension(
    423    UniqueChars& aUnicodeExtension) {
    424  Span<const char> extension = MakeStringSpan(aUnicodeExtension.get());
    425  MOZ_ASSERT(extension[0] == 'u');
    426  MOZ_ASSERT(extension[1] == '-');
    427  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));
    428 
    429  LocaleParser::AttributesVector attributes;
    430  LocaleParser::KeywordsVector keywords;
    431 
    432  using Attribute = LocaleParser::AttributesVector::ElementType;
    433  using Keyword = LocaleParser::KeywordsVector::ElementType;
    434 
    435  if (LocaleParser::ParseUnicodeExtension(extension, attributes, keywords)
    436          .isErr()) {
    437    MOZ_ASSERT_UNREACHABLE("unexpected invalid Unicode extension subtag");
    438    return Err(CanonicalizationError::InternalError);
    439  }
    440 
    441  auto attributesLess = [extension](const Attribute& a, const Attribute& b) {
    442    auto astr = extension.Subspan(a.Begin(), a.Length());
    443    auto bstr = extension.Subspan(b.Begin(), b.Length());
    444    return astr < bstr;
    445  };
    446 
    447  // All attributes are sorted in alphabetical order.
    448  if (attributes.length() > 1) {
    449    std::stable_sort(attributes.begin(), attributes.end(), attributesLess);
    450  }
    451 
    452  auto keywordsLess = [extension](const Keyword& a, const Keyword& b) {
    453    auto astr = extension.Subspan(a.Begin(), UnicodeKeyLength);
    454    auto bstr = extension.Subspan(b.Begin(), UnicodeKeyLength);
    455    return astr < bstr;
    456  };
    457 
    458  // All keywords are sorted by alphabetical order of keys.
    459  if (keywords.length() > 1) {
    460    // Using a stable sort algorithm, guarantees that two keywords using the
    461    // same key are never reordered. That means for example
    462    // when we have the input "u-nu-thai-kf-false-nu-latn", we are guaranteed to
    463    // get the result "u-kf-false-nu-thai-nu-latn", i.e. "nu-thai" still occurs
    464    // before "nu-latn".
    465    // This is required so that deduplication below preserves the first keyword
    466    // for a given key and discards the rest.
    467    std::stable_sort(keywords.begin(), keywords.end(), keywordsLess);
    468  }
    469 
    470  Vector<char, 32> sb;
    471  if (!sb.append('u')) {
    472    return Err(CanonicalizationError::OutOfMemory);
    473  }
    474 
    475  // Append all Unicode extension attributes.
    476  for (size_t i = 0; i < attributes.length(); i++) {
    477    const auto& attribute = attributes[i];
    478    auto span = extension.Subspan(attribute.Begin(), attribute.Length());
    479 
    480    // Skip duplicate attributes.
    481    if (i > 0) {
    482      const auto& lastAttribute = attributes[i - 1];
    483      if (span ==
    484          extension.Subspan(lastAttribute.Begin(), lastAttribute.Length())) {
    485        continue;
    486      }
    487      MOZ_ASSERT(attributesLess(lastAttribute, attribute));
    488    }
    489 
    490    if (!sb.append('-')) {
    491      return Err(CanonicalizationError::OutOfMemory);
    492    }
    493    if (!AppendSpan(sb, span)) {
    494      return Err(CanonicalizationError::OutOfMemory);
    495    }
    496  }
    497 
    498  static constexpr size_t UnicodeKeyWithSepLength = UnicodeKeyLength + 1;
    499 
    500  using StringSpan = Span<const char>;
    501 
    502  static constexpr StringSpan True = MakeStringSpan("true");
    503 
    504  // Append all Unicode extension keywords.
    505  for (size_t i = 0; i < keywords.length(); i++) {
    506    const auto& keyword = keywords[i];
    507 
    508    // Skip duplicate keywords.
    509    if (i > 0) {
    510      const auto& lastKeyword = keywords[i - 1];
    511      if (extension.Subspan(keyword.Begin(), UnicodeKeyLength) ==
    512          extension.Subspan(lastKeyword.Begin(), UnicodeKeyLength)) {
    513        continue;
    514      }
    515      MOZ_ASSERT(keywordsLess(lastKeyword, keyword));
    516    }
    517 
    518    if (!sb.append('-')) {
    519      return Err(CanonicalizationError::OutOfMemory);
    520    }
    521 
    522    StringSpan span = extension.Subspan(keyword.Begin(), keyword.Length());
    523    if (span.size() == UnicodeKeyLength) {
    524      // Keyword without type value.
    525      if (!AppendSpan(sb, span)) {
    526        return Err(CanonicalizationError::OutOfMemory);
    527      }
    528    } else {
    529      StringSpan key = span.To(UnicodeKeyLength);
    530      StringSpan type = span.From(UnicodeKeyWithSepLength);
    531 
    532      // Search if there's a replacement for the current Unicode keyword.
    533      if (const char* replacement = ReplaceUnicodeExtensionType(key, type)) {
    534        StringSpan repl = MakeStringSpan(replacement);
    535        if (repl == True) {
    536          // Elide the type "true" if present in the replacement.
    537          if (!AppendSpan(sb, key)) {
    538            return Err(CanonicalizationError::OutOfMemory);
    539          }
    540        } else {
    541          // Otherwise append the Unicode key (including the separator) and the
    542          // replaced type.
    543          if (!AppendSpan(sb, span.To(UnicodeKeyWithSepLength))) {
    544            return Err(CanonicalizationError::OutOfMemory);
    545          }
    546          if (!AppendSpan(sb, repl)) {
    547            return Err(CanonicalizationError::OutOfMemory);
    548          }
    549        }
    550      } else {
    551        if (type == True) {
    552          // Elide the Unicode extension type "true".
    553          if (!AppendSpan(sb, key)) {
    554            return Err(CanonicalizationError::OutOfMemory);
    555          }
    556        } else {
    557          // Otherwise append the complete Unicode extension keyword.
    558          if (!AppendSpan(sb, span)) {
    559            return Err(CanonicalizationError::OutOfMemory);
    560          }
    561        }
    562      }
    563    }
    564  }
    565 
    566  // We can keep the previous extension when canonicalization didn't modify it.
    567  if (static_cast<Span<const char>>(sb) != extension) {
    568    // Otherwise replace the previous extension with the canonical extension.
    569    UniqueChars canonical = DuplicateStringToUniqueChars(sb);
    570    if (!canonical) {
    571      return Err(CanonicalizationError::OutOfMemory);
    572    }
    573    aUnicodeExtension = std::move(canonical);
    574  }
    575 
    576  return Ok();
    577 }
    578 
    579 template <class Buffer>
    580 static bool LocaleToString(const Locale& aTag, Buffer& aBuffer) {
    581  auto appendSubtag = [&aBuffer](const auto& subtag) {
    582    auto span = subtag.Span();
    583    MOZ_ASSERT(!span.empty());
    584    return aBuffer.append(span.data(), span.size());
    585  };
    586 
    587  auto appendSubtagSpan = [&aBuffer](Span<const char> subtag) {
    588    MOZ_ASSERT(!subtag.empty());
    589    return aBuffer.append(subtag.data(), subtag.size());
    590  };
    591 
    592  auto appendSubtags = [&aBuffer, &appendSubtagSpan](const auto& subtags) {
    593    for (const auto& subtag : subtags) {
    594      if (!aBuffer.append('-') || !appendSubtagSpan(subtag)) {
    595        return false;
    596      }
    597    }
    598    return true;
    599  };
    600 
    601  // Append the language subtag.
    602  if (!appendSubtag(aTag.Language())) {
    603    return false;
    604  }
    605 
    606  // Append the script subtag if present.
    607  if (aTag.Script().Present()) {
    608    if (!aBuffer.append('-') || !appendSubtag(aTag.Script())) {
    609      return false;
    610    }
    611  }
    612 
    613  // Append the region subtag if present.
    614  if (aTag.Region().Present()) {
    615    if (!aBuffer.append('-') || !appendSubtag(aTag.Region())) {
    616      return false;
    617    }
    618  }
    619 
    620  // Append the variant subtags if present.
    621  if (!appendSubtags(aTag.Variants())) {
    622    return false;
    623  }
    624 
    625  // Append the extensions subtags if present.
    626  if (!appendSubtags(aTag.Extensions())) {
    627    return false;
    628  }
    629 
    630  // Append the private-use subtag if present.
    631  if (auto privateuse = aTag.PrivateUse()) {
    632    if (!aBuffer.append('-') || !appendSubtagSpan(privateuse.value())) {
    633      return false;
    634    }
    635  }
    636 
    637  return true;
    638 }
    639 
    640 /**
    641 * CanonicalizeTransformExtension
    642 *
    643 * Canonical form per <https://unicode.org/reports/tr35/#BCP47_T_Extension>:
    644 *
    645 * - These subtags are all in lowercase (that is the canonical casing for these
    646 *   subtags), [...].
    647 *
    648 * And per
    649 * <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>:
    650 *
    651 * - All keywords and tfields are sorted by alphabetical order of their keys,
    652 *   within their respective extensions.
    653 */
    654 Result<Ok, Locale::CanonicalizationError>
    655 Locale::CanonicalizeTransformExtension(UniqueChars& aTransformExtension) {
    656  Span<const char> extension = MakeStringSpan(aTransformExtension.get());
    657  MOZ_ASSERT(extension[0] == 't');
    658  MOZ_ASSERT(extension[1] == '-');
    659  MOZ_ASSERT(IsStructurallyValidExtensionTag(extension));
    660 
    661  Locale tag;
    662  LocaleParser::TFieldVector fields;
    663 
    664  using TField = LocaleParser::TFieldVector::ElementType;
    665 
    666  if (LocaleParser::ParseTransformExtension(extension, tag, fields).isErr()) {
    667    MOZ_ASSERT_UNREACHABLE("unexpected invalid transform extension subtag");
    668    return Err(CanonicalizationError::InternalError);
    669  }
    670 
    671  auto tfieldLess = [extension](const TField& a, const TField& b) {
    672    auto astr = extension.Subspan(a.Begin(), TransformKeyLength);
    673    auto bstr = extension.Subspan(b.Begin(), TransformKeyLength);
    674    return astr < bstr;
    675  };
    676 
    677  // All tfields are sorted by alphabetical order of their keys.
    678  if (fields.length() > 1) {
    679    std::stable_sort(fields.begin(), fields.end(), tfieldLess);
    680  }
    681 
    682  Vector<char, 32> sb;
    683  if (!sb.append('t')) {
    684    return Err(CanonicalizationError::OutOfMemory);
    685  }
    686 
    687  // Append the language subtag if present.
    688  //
    689  // Replace aliases in tlang per
    690  // <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>.
    691  if (tag.Language().Present()) {
    692    if (!sb.append('-')) {
    693      return Err(CanonicalizationError::OutOfMemory);
    694    }
    695 
    696    MOZ_TRY(tag.CanonicalizeBaseName());
    697 
    698    // The canonical case for Transform extensions is lowercase per
    699    // <https://unicode.org/reports/tr35/#BCP47_T_Extension>. Convert the two
    700    // subtags which don't use lowercase for their canonical syntax.
    701    tag.mScript.ToLowerCase();
    702    tag.mRegion.ToLowerCase();
    703 
    704    if (!LocaleToString(tag, sb)) {
    705      return Err(CanonicalizationError::OutOfMemory);
    706    }
    707  }
    708 
    709  static constexpr size_t TransformKeyWithSepLength = TransformKeyLength + 1;
    710 
    711  using StringSpan = Span<const char>;
    712 
    713  // Append all fields.
    714  //
    715  // UTS 35, 3.2.1 specifies:
    716  // - Any type or tfield value "true" is removed.
    717  //
    718  // But the `tvalue` subtag is mandatory in `tfield: tkey tvalue`, so ignore
    719  // this apparently invalid part of the UTS 35 specification and simply
    720  // append all `tfield` subtags.
    721  for (const auto& field : fields) {
    722    if (!sb.append('-')) {
    723      return Err(CanonicalizationError::OutOfMemory);
    724    }
    725 
    726    StringSpan span = extension.Subspan(field.Begin(), field.Length());
    727    StringSpan key = span.To(TransformKeyLength);
    728    StringSpan value = span.From(TransformKeyWithSepLength);
    729 
    730    // Search if there's a replacement for the current transform keyword.
    731    if (const char* replacement = ReplaceTransformExtensionType(key, value)) {
    732      if (!AppendSpan(sb, span.To(TransformKeyWithSepLength))) {
    733        return Err(CanonicalizationError::OutOfMemory);
    734      }
    735      if (!AppendSpan(sb, MakeStringSpan(replacement))) {
    736        return Err(CanonicalizationError::OutOfMemory);
    737      }
    738    } else {
    739      if (!AppendSpan(sb, span)) {
    740        return Err(CanonicalizationError::OutOfMemory);
    741      }
    742    }
    743  }
    744 
    745  // We can keep the previous extension when canonicalization didn't modify it.
    746  if (static_cast<Span<const char>>(sb) != extension) {
    747    // Otherwise replace the previous extension with the canonical extension.
    748    UniqueChars canonical = DuplicateStringToUniqueChars(sb);
    749    if (!canonical) {
    750      return Err(CanonicalizationError::OutOfMemory);
    751    }
    752    aTransformExtension = std::move(canonical);
    753  }
    754 
    755  return Ok();
    756 }
    757 
// Zero-terminated ICU Locale ID.
//
// The inline capacity covers what CreateLocaleForLikelySubtags() appends: the
// language subtag, a separator, the script subtag, a separator, the region
// subtag, and the terminating zero character.
using LocaleId =
    Vector<char, LanguageLength + 1 + ScriptLength + 1 + RegionLength + 1>;

// Selects the likely subtags operation. HasLikelySubtags() treats |Add| as
// "maximize the locale" and |Remove| as "minimize the locale".
enum class LikelySubtags : bool { Add, Remove };
    763 
    764 // Return true iff the locale is already maximized resp. minimized.
    765 static bool HasLikelySubtags(LikelySubtags aLikelySubtags, const Locale& aTag) {
    766  // The locale is already maximized if the language, script, and region
    767  // subtags are present and no placeholder subtags ("und", "Zzzz", "ZZ") are
    768  // used.
    769  if (aLikelySubtags == LikelySubtags::Add) {
    770    return !aTag.Language().EqualTo("und") &&
    771           (aTag.Script().Present() && !aTag.Script().EqualTo("Zzzz")) &&
    772           (aTag.Region().Present() && !aTag.Region().EqualTo("ZZ"));
    773  }
    774 
    775  // The locale is already minimized if it only contains a language
    776  // subtag whose value is not the placeholder value "und".
    777  return !aTag.Language().EqualTo("und") && aTag.Script().Missing() &&
    778         aTag.Region().Missing();
    779 }
    780 
    781 // Create an ICU locale ID from the given locale.
    782 static bool CreateLocaleForLikelySubtags(const Locale& aTag,
    783                                         LocaleId& aLocale) {
    784  MOZ_ASSERT(aLocale.length() == 0);
    785 
    786  auto appendSubtag = [&aLocale](const auto& subtag) {
    787    auto span = subtag.Span();
    788    MOZ_ASSERT(!span.empty());
    789    return aLocale.append(span.data(), span.size());
    790  };
    791 
    792  // Append the language subtag.
    793  if (!appendSubtag(aTag.Language())) {
    794    return false;
    795  }
    796 
    797  // Append the script subtag if present.
    798  if (aTag.Script().Present()) {
    799    if (!aLocale.append('_') || !appendSubtag(aTag.Script())) {
    800      return false;
    801    }
    802  }
    803 
    804  // Append the region subtag if present.
    805  if (aTag.Region().Present()) {
    806    if (!aLocale.append('_') || !appendSubtag(aTag.Region())) {
    807      return false;
    808    }
    809  }
    810 
    811  // Zero-terminated for use with ICU.
    812  return aLocale.append('\0');
    813 }
    814 
    815 static ICUError ParserErrorToICUError(LocaleParser::ParserError aErr) {
    816  using ParserError = LocaleParser::ParserError;
    817 
    818  switch (aErr) {
    819    case ParserError::NotParseable:
    820      return ICUError::InternalError;
    821    case ParserError::OutOfMemory:
    822      return ICUError::OutOfMemory;
    823  }
    824  MOZ_CRASH("Unexpected parser error");
    825 }
    826 
    827 static ICUError CanonicalizationErrorToICUError(
    828    Locale::CanonicalizationError aErr) {
    829  using CanonicalizationError = Locale::CanonicalizationError;
    830 
    831  switch (aErr) {
    832    case CanonicalizationError::DuplicateVariant:
    833    case CanonicalizationError::InternalError:
    834      return ICUError::InternalError;
    835    case CanonicalizationError::OutOfMemory:
    836      return ICUError::OutOfMemory;
    837  }
    838  MOZ_CRASH("Unexpected canonicalization error");
    839 }
    840 
    841 // Assign the language, script, and region subtags from an ICU locale ID.
    842 //
    843 // ICU provides |uloc_getLanguage|, |uloc_getScript|, and |uloc_getCountry| to
    844 // retrieve these subtags, but unfortunately these functions are rather slow, so
    845 // we use our own implementation.
    846 static ICUResult AssignFromLocaleId(LocaleId& aLocaleId, Locale& aTag) {
    847  // Replace the ICU locale ID separator.
    848  std::replace(aLocaleId.begin(), aLocaleId.end(), '_', '-');
    849 
    850  // ICU replaces "und" with the empty string, which means "und" becomes "" and
    851  // "und-Latn" becomes "-Latn". Handle this case separately.
    852  if (aLocaleId.empty() || aLocaleId[0] == '-') {
    853    static constexpr auto und = MakeStringSpan("und");
    854    constexpr size_t length = und.size();
    855 
    856    // Insert "und" in front of the locale ID.
    857    if (!aLocaleId.growBy(length)) {
    858      return Err(ICUError::OutOfMemory);
    859    }
    860    memmove(aLocaleId.begin() + length, aLocaleId.begin(), aLocaleId.length());
    861    memmove(aLocaleId.begin(), und.data(), length);
    862  }
    863 
    864  // Retrieve the language, script, and region subtags from the locale ID
    865  Locale localeTag;
    866  MOZ_TRY(LocaleParser::TryParseBaseName(aLocaleId, localeTag)
    867              .mapErr(ParserErrorToICUError));
    868 
    869  aTag.SetLanguage(localeTag.Language());
    870  aTag.SetScript(localeTag.Script());
    871  aTag.SetRegion(localeTag.Region());
    872 
    873  return Ok();
    874 }
    875 
    876 template <decltype(uloc_addLikelySubtags) likelySubtagsFn>
    877 static ICUResult CallLikelySubtags(const LocaleId& aLocaleId,
    878                                   LocaleId& aResult) {
    879  // Locale ID must be zero-terminated before passing it to ICU.
    880  MOZ_ASSERT(aLocaleId.back() == '\0');
    881  MOZ_ASSERT(aResult.length() == 0);
    882 
    883  // Ensure there's enough room for the result.
    884  MOZ_ALWAYS_TRUE(aResult.resize(LocaleId::InlineLength));
    885 
    886  return FillBufferWithICUCall(
    887      aResult, [&aLocaleId](char* chars, int32_t size, UErrorCode* status) {
    888        return likelySubtagsFn(aLocaleId.begin(), chars, size, status);
    889      });
    890 }
    891 
    892 // The canonical way to compute the Unicode BCP 47 locale identifier with likely
    893 // subtags is as follows:
    894 //
    895 // 1. Call uloc_forLanguageTag() to transform the locale identifer into an ICU
    896 //    locale ID.
    897 // 2. Call uloc_addLikelySubtags() to add the likely subtags to the locale ID.
    898 // 3. Call uloc_toLanguageTag() to transform the resulting locale ID back into
    899 //    a Unicode BCP 47 locale identifier.
    900 //
    901 // Since uloc_forLanguageTag() and uloc_toLanguageTag() are both kind of slow
    902 // and we know, by construction, that the input Unicode BCP 47 locale identifier
    903 // only contains valid language, script, and region subtags, we can avoid both
    904 // calls if we implement them ourselves, see CreateLocaleForLikelySubtags() and
    905 // AssignFromLocaleId(). (Where "slow" means about 50% of the execution time of
    906 // |Intl.Locale.prototype.maximize|.)
    907 static ICUResult LikelySubtags(LikelySubtags aLikelySubtags, Locale& aTag) {
    908  // Return early if the input is already maximized/minimized.
    909  if (HasLikelySubtags(aLikelySubtags, aTag)) {
    910    return Ok();
    911  }
    912 
    913  // Create the locale ID for the input argument.
    914  LocaleId locale;
    915  if (!CreateLocaleForLikelySubtags(aTag, locale)) {
    916    return Err(ICUError::OutOfMemory);
    917  }
    918 
    919  // Either add or remove likely subtags to/from the locale ID.
    920  LocaleId localeLikelySubtags;
    921  if (aLikelySubtags == LikelySubtags::Add) {
    922    MOZ_TRY(
    923        CallLikelySubtags<uloc_addLikelySubtags>(locale, localeLikelySubtags));
    924  } else {
    925    MOZ_TRY(
    926        CallLikelySubtags<uloc_minimizeSubtags>(locale, localeLikelySubtags));
    927  }
    928 
    929  // Assign the language, script, and region subtags from the locale ID.
    930  MOZ_TRY(AssignFromLocaleId(localeLikelySubtags, aTag));
    931 
    932  // Update mappings in case ICU returned a non-canonical locale.
    933  MOZ_TRY(aTag.CanonicalizeBaseName().mapErr(CanonicalizationErrorToICUError));
    934 
    935  return Ok();
    936 }
    937 
    938 ICUResult Locale::AddLikelySubtags() {
    939  return LikelySubtags(LikelySubtags::Add, *this);
    940 }
    941 
    942 ICUResult Locale::RemoveLikelySubtags() {
    943  return LikelySubtags(LikelySubtags::Remove, *this);
    944 }
    945 
    946 UniqueChars Locale::DuplicateStringToUniqueChars(const char* aStr) {
    947  size_t length = strlen(aStr) + 1;
    948  auto duplicate = MakeUnique<char[]>(length);
    949  memcpy(duplicate.get(), aStr, length);
    950  return duplicate;
    951 }
    952 
    953 UniqueChars Locale::DuplicateStringToUniqueChars(Span<const char> aStr) {
    954  size_t length = aStr.size();
    955  auto duplicate = MakeUnique<char[]>(length + 1);
    956  memcpy(duplicate.get(), aStr.data(), length);
    957  duplicate[length] = '\0';
    958  return duplicate;
    959 }
    960 
    961 size_t Locale::ToStringCapacity() const {
    962  // This is a bit awkward, the buffer class currently does not support
    963  // being resized, so we need to calculate the required size up front and
    964  // reserve it all at once.
    965  auto lengthSubtag = [](const auto& subtag) {
    966    auto span = subtag.Span();
    967    MOZ_ASSERT(!span.empty());
    968    return span.size();
    969  };
    970 
    971  auto lengthSubtagZ = [](const char* subtag) {
    972    size_t length = strlen(subtag);
    973    MOZ_ASSERT(length > 0);
    974    return length;
    975  };
    976 
    977  auto lengthSubtags = [&lengthSubtag](const auto& subtags) {
    978    size_t length = 0;
    979    for (const auto& subtag : subtags) {
    980      length += lengthSubtag(subtag) + 1;
    981    }
    982    return length;
    983  };
    984 
    985  auto lengthSubtagsZ = [&lengthSubtagZ](const auto& subtags) {
    986    size_t length = 0;
    987    for (const auto& subtag : subtags) {
    988      length += lengthSubtagZ(subtag.get()) + 1;
    989    }
    990    return length;
    991  };
    992 
    993  // First calculate required capacity
    994  size_t capacity = 0;
    995 
    996  capacity += lengthSubtag(mLanguage);
    997 
    998  if (mScript.Present()) {
    999    capacity += lengthSubtag(mScript) + 1;
   1000  }
   1001 
   1002  if (mRegion.Present()) {
   1003    capacity += lengthSubtag(mRegion) + 1;
   1004  }
   1005 
   1006  capacity += lengthSubtags(mVariants);
   1007 
   1008  capacity += lengthSubtagsZ(mExtensions);
   1009 
   1010  if (mPrivateUse.get()) {
   1011    capacity += lengthSubtagZ(mPrivateUse.get()) + 1;
   1012  }
   1013 
   1014  return capacity;
   1015 }
   1016 
   1017 size_t Locale::ToStringAppend(char* aBuffer) const {
   1018  // Current write position inside buffer.
   1019  size_t offset = 0;
   1020 
   1021  auto appendHyphen = [&offset, &aBuffer]() {
   1022    aBuffer[offset] = '-';
   1023    offset += 1;
   1024  };
   1025 
   1026  auto appendSubtag = [&offset, &aBuffer](const auto& subtag) {
   1027    auto span = subtag.Span();
   1028    memcpy(aBuffer + offset, span.data(), span.size());
   1029    offset += span.size();
   1030  };
   1031 
   1032  auto appendSubtagZ = [&offset, &aBuffer](const char* subtag) {
   1033    size_t length = strlen(subtag);
   1034    memcpy(aBuffer + offset, subtag, length);
   1035    offset += length;
   1036  };
   1037 
   1038  auto appendSubtags = [&appendHyphen, &appendSubtag](const auto& subtags) {
   1039    for (const auto& subtag : subtags) {
   1040      appendHyphen();
   1041      appendSubtag(subtag);
   1042    }
   1043  };
   1044 
   1045  auto appendSubtagsZ = [&appendHyphen, &appendSubtagZ](const auto& subtags) {
   1046    for (const auto& subtag : subtags) {
   1047      appendHyphen();
   1048      appendSubtagZ(subtag.get());
   1049    }
   1050  };
   1051 
   1052  // Append the language subtag.
   1053  appendSubtag(mLanguage);
   1054 
   1055  // Append the script subtag if present.
   1056  if (mScript.Present()) {
   1057    appendHyphen();
   1058    appendSubtag(mScript);
   1059  }
   1060 
   1061  // Append the region subtag if present.
   1062  if (mRegion.Present()) {
   1063    appendHyphen();
   1064    appendSubtag(mRegion);
   1065  }
   1066 
   1067  // Append the variant subtags if present.
   1068  appendSubtags(mVariants);
   1069 
   1070  // Append the extensions subtags if present.
   1071  appendSubtagsZ(mExtensions);
   1072 
   1073  // Append the private-use subtag if present.
   1074  if (mPrivateUse.get()) {
   1075    appendHyphen();
   1076    appendSubtagZ(mPrivateUse.get());
   1077  }
   1078 
   1079  return offset;
   1080 }
   1081 
   1082 LocaleParser::Token LocaleParser::NextToken() {
   1083  MOZ_ASSERT(mIndex <= mLength + 1, "called after 'None' token was read");
   1084 
   1085  TokenKind kind = TokenKind::None;
   1086  size_t tokenLength = 0;
   1087  for (size_t i = mIndex; i < mLength; i++) {
   1088    // UTS 35, section 3.1.
   1089    // alpha = [A-Z a-z] ;
   1090    // digit = [0-9] ;
   1091    char c = CharAt(i);
   1092    if (IsAsciiAlpha(c)) {
   1093      kind |= TokenKind::Alpha;
   1094    } else if (IsAsciiDigit(c)) {
   1095      kind |= TokenKind::Digit;
   1096    } else if (c == '-' && i > mIndex && i + 1 < mLength) {
   1097      break;
   1098    } else {
   1099      return {TokenKind::Error, 0, 0};
   1100    }
   1101    tokenLength += 1;
   1102  }
   1103 
   1104  Token token{kind, mIndex, tokenLength};
   1105  mIndex += tokenLength + 1;
   1106  return token;
   1107 }
   1108 
   1109 UniqueChars LocaleParser::Chars(size_t aIndex, size_t aLength) const {
   1110  // Add +1 to null-terminate the string.
   1111  auto chars = MakeUnique<char[]>(aLength + 1);
   1112  char* dest = chars.get();
   1113  std::copy_n(mLocale + aIndex, aLength, dest);
   1114  dest[aLength] = '\0';
   1115  return chars;
   1116 }
   1117 
   1118 // Parse the `unicode_language_id` production.
   1119 //
   1120 // unicode_language_id = unicode_language_subtag
   1121 //                       (sep unicode_script_subtag)?
   1122 //                       (sep unicode_region_subtag)?
   1123 //                       (sep unicode_variant_subtag)* ;
   1124 //
   1125 // sep                 = "-"
   1126 //
   1127 // Note: Unicode CLDR locale identifier backward compatibility extensions
   1128 //       removed from `unicode_language_id`.
   1129 //
   1130 // |tok| is the current token from |ts|.
   1131 //
   1132 // All subtags will be added unaltered to |tag|, without canonicalizing their
   1133 // case or, in the case of variant subtags, detecting and rejecting duplicate
   1134 // variants. Users must subsequently |CanonicalizeBaseName| to perform these
   1135 // actions.
   1136 //
   1137 // Do not use this function directly: use |ParseBaseName| or
   1138 // |ParseTlangFromTransformExtension| instead.
   1139 Result<Ok, LocaleParser::ParserError> LocaleParser::InternalParseBaseName(
   1140    LocaleParser& aLocaleParser, Locale& aTag, Token& aTok) {
   1141  if (aLocaleParser.IsLanguage(aTok)) {
   1142    aLocaleParser.CopyChars(aTok, aTag.mLanguage);
   1143 
   1144    aTok = aLocaleParser.NextToken();
   1145  } else {
   1146    // The language subtag is mandatory.
   1147    return Err(ParserError::NotParseable);
   1148  }
   1149 
   1150  if (aLocaleParser.IsScript(aTok)) {
   1151    aLocaleParser.CopyChars(aTok, aTag.mScript);
   1152 
   1153    aTok = aLocaleParser.NextToken();
   1154  }
   1155 
   1156  if (aLocaleParser.IsRegion(aTok)) {
   1157    aLocaleParser.CopyChars(aTok, aTag.mRegion);
   1158 
   1159    aTok = aLocaleParser.NextToken();
   1160  }
   1161 
   1162  auto& variants = aTag.mVariants;
   1163  MOZ_ASSERT(variants.length() == 0);
   1164  while (aLocaleParser.IsVariant(aTok)) {
   1165    VariantSubtag variant{};
   1166    aLocaleParser.CopyChars(aTok, variant);
   1167    if (!variants.append(variant)) {
   1168      return Err(ParserError::OutOfMemory);
   1169    }
   1170 
   1171    aTok = aLocaleParser.NextToken();
   1172  }
   1173 
   1174  return Ok();
   1175 }
   1176 
   1177 Result<Ok, LocaleParser::ParserError> LocaleParser::TryParse(
   1178    mozilla::Span<const char> aLocale, Locale& aTag) {
   1179  // |aTag| must be a new, empty Locale.
   1180  MOZ_ASSERT(aTag.Language().Missing());
   1181  MOZ_ASSERT(aTag.Script().Missing());
   1182  MOZ_ASSERT(aTag.Region().Missing());
   1183  MOZ_ASSERT(aTag.Variants().empty());
   1184  MOZ_ASSERT(aTag.Extensions().empty());
   1185  MOZ_ASSERT(aTag.PrivateUse().isNothing());
   1186 
   1187  // unicode_locale_id = unicode_language_id
   1188  //                     extensions*
   1189  //                     pu_extensions? ;
   1190 
   1191  LocaleParser ts(aLocale);
   1192  Token tok = ts.NextToken();
   1193 
   1194  MOZ_TRY(ParseBaseName(ts, aTag, tok));
   1195 
   1196  // extensions = unicode_locale_extensions
   1197  //            | transformed_extensions
   1198  //            | other_extensions ;
   1199 
   1200  // Bit set of seen singletons.
   1201  uint64_t seenSingletons = 0;
   1202 
   1203  auto& extensions = aTag.mExtensions;
   1204  while (ts.IsExtensionStart(tok)) {
   1205    char singleton = ts.SingletonKey(tok);
   1206 
   1207    // Reject the input if a duplicate singleton was found.
   1208    uint64_t hash = 1ULL << (AsciiAlphanumericToNumber(singleton) + 1);
   1209    if (seenSingletons & hash) {
   1210      return Err(ParserError::NotParseable);
   1211    }
   1212    seenSingletons |= hash;
   1213 
   1214    Token start = tok;
   1215    tok = ts.NextToken();
   1216 
   1217    // We'll check for missing non-singleton subtags after this block by
   1218    // comparing |startValue| with the then-current position.
   1219    size_t startValue = tok.Index();
   1220 
   1221    if (singleton == 'u') {
   1222      while (ts.IsUnicodeExtensionPart(tok)) {
   1223        tok = ts.NextToken();
   1224      }
   1225    } else if (singleton == 't') {
   1226      // transformed_extensions = sep [tT]
   1227      //                          ((sep tlang (sep tfield)*)
   1228      //                           | (sep tfield)+) ;
   1229 
   1230      // tlang = unicode_language_subtag
   1231      //         (sep unicode_script_subtag)?
   1232      //         (sep unicode_region_subtag)?
   1233      //         (sep unicode_variant_subtag)* ;
   1234      if (ts.IsLanguage(tok)) {
   1235        tok = ts.NextToken();
   1236 
   1237        if (ts.IsScript(tok)) {
   1238          tok = ts.NextToken();
   1239        }
   1240 
   1241        if (ts.IsRegion(tok)) {
   1242          tok = ts.NextToken();
   1243        }
   1244 
   1245        while (ts.IsVariant(tok)) {
   1246          tok = ts.NextToken();
   1247        }
   1248      }
   1249 
   1250      // tfield = tkey tvalue;
   1251      while (ts.IsTransformExtensionKey(tok)) {
   1252        tok = ts.NextToken();
   1253 
   1254        size_t startTValue = tok.Index();
   1255        while (ts.IsTransformExtensionPart(tok)) {
   1256          tok = ts.NextToken();
   1257        }
   1258 
   1259        // `tfield` requires at least one `tvalue`.
   1260        if (tok.Index() <= startTValue) {
   1261          return Err(ParserError::NotParseable);
   1262        }
   1263      }
   1264    } else {
   1265      while (ts.IsOtherExtensionPart(tok)) {
   1266        tok = ts.NextToken();
   1267      }
   1268    }
   1269 
   1270    // Singletons must be followed by a non-singleton subtag, "en-a-b" is not
   1271    // allowed.
   1272    if (tok.Index() <= startValue) {
   1273      return Err(ParserError::NotParseable);
   1274    }
   1275 
   1276    UniqueChars extension = ts.Extension(start, tok);
   1277    if (!extensions.append(std::move(extension))) {
   1278      return Err(ParserError::OutOfMemory);
   1279    }
   1280  }
   1281 
   1282  // Trailing `pu_extension` component of the `unicode_locale_id` production.
   1283  if (ts.IsPrivateUseStart(tok)) {
   1284    Token start = tok;
   1285    tok = ts.NextToken();
   1286 
   1287    size_t startValue = tok.Index();
   1288    while (ts.IsPrivateUsePart(tok)) {
   1289      tok = ts.NextToken();
   1290    }
   1291 
   1292    // There must be at least one subtag after the "-x-".
   1293    if (tok.Index() <= startValue) {
   1294      return Err(ParserError::NotParseable);
   1295    }
   1296 
   1297    UniqueChars privateUse = ts.Extension(start, tok);
   1298    aTag.mPrivateUse = std::move(privateUse);
   1299  }
   1300 
   1301  if (!tok.IsNone()) {
   1302    return Err(ParserError::NotParseable);
   1303  }
   1304 
   1305  return Ok();
   1306 }
   1307 
   1308 Result<Ok, LocaleParser::ParserError> LocaleParser::TryParseBaseName(
   1309    Span<const char> aLocale, Locale& aTag) {
   1310  // |aTag| must be a new, empty Locale.
   1311  MOZ_ASSERT(aTag.Language().Missing());
   1312  MOZ_ASSERT(aTag.Script().Missing());
   1313  MOZ_ASSERT(aTag.Region().Missing());
   1314  MOZ_ASSERT(aTag.Variants().empty());
   1315  MOZ_ASSERT(aTag.Extensions().empty());
   1316  MOZ_ASSERT(aTag.PrivateUse().isNothing());
   1317 
   1318  LocaleParser ts(aLocale);
   1319  Token tok = ts.NextToken();
   1320 
   1321  MOZ_TRY(ParseBaseName(ts, aTag, tok));
   1322  if (!tok.IsNone()) {
   1323    return Err(ParserError::NotParseable);
   1324  }
   1325 
   1326  return Ok();
   1327 }
   1328 
   1329 // Parse |aExtension|, which must be a valid `transformed_extensions` subtag,
   1330 // and fill |aTag| and |aFields| from the `tlang` and `tfield` components.
   1331 Result<Ok, LocaleParser::ParserError> LocaleParser::ParseTransformExtension(
   1332    Span<const char> aExtension, Locale& aTag, TFieldVector& aFields) {
   1333  LocaleParser ts(aExtension);
   1334  Token tok = ts.NextToken();
   1335 
   1336  if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 't') {
   1337    return Err(ParserError::NotParseable);
   1338  }
   1339 
   1340  tok = ts.NextToken();
   1341 
   1342  if (tok.IsNone()) {
   1343    return Err(ParserError::NotParseable);
   1344  }
   1345 
   1346  if (ts.IsLanguage(tok)) {
   1347    // We're parsing a possible `tlang` in a known-valid transform extension, so
   1348    // use the special-purpose function that takes advantage of this to compute
   1349    // lowercased |tag| contents in an optimal manner.
   1350    MOZ_TRY(ParseTlangInTransformExtension(ts, aTag, tok));
   1351 
   1352    // After `tlang` we must have a `tfield` and its `tkey`, or we're at the end
   1353    // of the transform extension.
   1354    MOZ_ASSERT(ts.IsTransformExtensionKey(tok) || tok.IsNone());
   1355  } else {
   1356    // If there's no `tlang` subtag, at least one `tfield` must be present.
   1357    MOZ_ASSERT(ts.IsTransformExtensionKey(tok));
   1358  }
   1359 
   1360  // Trailing `tfield` subtags. (Any other trailing subtags are an error,
   1361  // because we're guaranteed to only see a valid tranform extension here.)
   1362  while (ts.IsTransformExtensionKey(tok)) {
   1363    size_t begin = tok.Index();
   1364    tok = ts.NextToken();
   1365 
   1366    size_t startTValue = tok.Index();
   1367    while (ts.IsTransformExtensionPart(tok)) {
   1368      tok = ts.NextToken();
   1369    }
   1370 
   1371    // `tfield` requires at least one `tvalue`.
   1372    if (tok.Index() <= startTValue) {
   1373      return Err(ParserError::NotParseable);
   1374    }
   1375 
   1376    size_t length = tok.Index() - 1 - begin;
   1377    if (!aFields.emplaceBack(begin, length)) {
   1378      return Err(ParserError::OutOfMemory);
   1379    }
   1380  }
   1381 
   1382  if (!tok.IsNone()) {
   1383    return Err(ParserError::NotParseable);
   1384  }
   1385 
   1386  return Ok();
   1387 }
   1388 
   1389 // Parse |aExtension|, which must be a valid `unicode_locale_extensions` subtag,
   1390 // and fill |aAttributes| and |aKeywords| from the `attribute` and `keyword`
   1391 // components.
   1392 Result<Ok, LocaleParser::ParserError> LocaleParser::ParseUnicodeExtension(
   1393    Span<const char> aExtension, AttributesVector& aAttributes,
   1394    KeywordsVector& aKeywords) {
   1395  LocaleParser ts(aExtension);
   1396  Token tok = ts.NextToken();
   1397 
   1398  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
   1399  //                                       (sep attribute)+ (sep keyword)*) ;
   1400 
   1401  if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') {
   1402    return Err(ParserError::NotParseable);
   1403  }
   1404 
   1405  tok = ts.NextToken();
   1406 
   1407  if (tok.IsNone()) {
   1408    return Err(ParserError::NotParseable);
   1409  }
   1410 
   1411  while (ts.IsUnicodeExtensionAttribute(tok)) {
   1412    if (!aAttributes.emplaceBack(tok.Index(), tok.Length())) {
   1413      return Err(ParserError::OutOfMemory);
   1414    }
   1415 
   1416    tok = ts.NextToken();
   1417  }
   1418 
   1419  // keyword = key (sep type)? ;
   1420  while (ts.IsUnicodeExtensionKey(tok)) {
   1421    size_t begin = tok.Index();
   1422    tok = ts.NextToken();
   1423 
   1424    while (ts.IsUnicodeExtensionType(tok)) {
   1425      tok = ts.NextToken();
   1426    }
   1427 
   1428    if (tok.IsError()) {
   1429      return Err(ParserError::NotParseable);
   1430    }
   1431 
   1432    size_t length = tok.Index() - 1 - begin;
   1433    if (!aKeywords.emplaceBack(begin, length)) {
   1434      return Err(ParserError::OutOfMemory);
   1435    }
   1436  }
   1437 
   1438  if (!tok.IsNone()) {
   1439    return Err(ParserError::NotParseable);
   1440  }
   1441 
   1442  return Ok();
   1443 }
   1444 
   1445 Result<Ok, LocaleParser::ParserError> LocaleParser::CanParseUnicodeExtension(
   1446    Span<const char> aExtension) {
   1447  LocaleParser ts(aExtension);
   1448  Token tok = ts.NextToken();
   1449 
   1450  // unicode_locale_extensions = sep [uU] ((sep keyword)+ |
   1451  //                                       (sep attribute)+ (sep keyword)*) ;
   1452 
   1453  if (!ts.IsExtensionStart(tok) || ts.SingletonKey(tok) != 'u') {
   1454    return Err(ParserError::NotParseable);
   1455  }
   1456 
   1457  tok = ts.NextToken();
   1458 
   1459  if (tok.IsNone()) {
   1460    return Err(ParserError::NotParseable);
   1461  }
   1462 
   1463  while (ts.IsUnicodeExtensionAttribute(tok)) {
   1464    tok = ts.NextToken();
   1465  }
   1466 
   1467  // keyword = key (sep type)? ;
   1468  while (ts.IsUnicodeExtensionKey(tok)) {
   1469    tok = ts.NextToken();
   1470 
   1471    while (ts.IsUnicodeExtensionType(tok)) {
   1472      tok = ts.NextToken();
   1473    }
   1474 
   1475    if (tok.IsError()) {
   1476      return Err(ParserError::NotParseable);
   1477    }
   1478  }
   1479 
   1480  if (!tok.IsNone()) {
   1481    return Err(ParserError::OutOfMemory);
   1482  }
   1483 
   1484  return Ok();
   1485 }
   1486 
   1487 Result<Ok, LocaleParser::ParserError>
   1488 LocaleParser::CanParseUnicodeExtensionType(Span<const char> aUnicodeType) {
   1489  MOZ_ASSERT(!aUnicodeType.empty(), "caller must exclude empty strings");
   1490 
   1491  LocaleParser ts(aUnicodeType);
   1492  Token tok = ts.NextToken();
   1493 
   1494  while (ts.IsUnicodeExtensionType(tok)) {
   1495    tok = ts.NextToken();
   1496  }
   1497 
   1498  if (!tok.IsNone()) {
   1499    return Err(ParserError::NotParseable);
   1500  }
   1501 
   1502  return Ok();
   1503 }
   1504 
   1505 }  // namespace mozilla::intl
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE