tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

make_intl_data.py (134274B)


      1 #!/usr/bin/env python
      2 #
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 """Usage:
      8 make_intl_data.py langtags [cldr_common.zip]
      9 make_intl_data.py tzdata
     10 make_intl_data.py currency
     11 make_intl_data.py units
     12 make_intl_data.py numbering
     13 
     14 
     15 Target "langtags":
     16 This script extracts information about 1) mappings between deprecated and
     17 current Unicode BCP 47 locale identifiers, and 2) deprecated and current
     18 BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
     19 code in intl/components/LocaleGenerated.cpp. The code is used in
     20 intl/components/Locale.cpp.
     21 
     22 
     23 Target "tzdata":
     24 This script computes which time zone informations are not up-to-date in ICU
     25 and provides the necessary mappings to workaround this problem.
     26 https://ssl.icu-project.org/trac/ticket/12044
     27 
     28 
     29 Target "currency":
     30 Generates the mapping from currency codes to decimal digits used for them.
     31 
     32 
     33 Target "units":
     34 Generate source and test files using the list of so-called "sanctioned unit
     35 identifiers" and verifies that the ICU data filter includes these units.
     36 
     37 
     38 Target "numbering":
     39 Generate source and test files using the list of numbering systems with
     40 simple digit mappings and verifies that it's in sync with ICU/CLDR.
     41 """
     42 
     43 import io
     44 import json
     45 import os
     46 import re
     47 import tarfile
     48 import tempfile
     49 from contextlib import closing
     50 from functools import partial, total_ordering
     51 from itertools import chain, filterfalse, groupby, tee, zip_longest
     52 from operator import attrgetter, itemgetter
     53 from urllib.parse import urlsplit
     54 from urllib.request import Request as UrlRequest
     55 from urllib.request import urlopen
     56 from zipfile import ZipFile
     57 
     58 import yaml
     59 
     60 
     61 # From https://docs.python.org/3/library/itertools.html
     62 def grouper(iterable, n, fillvalue=None):
     63    "Collect data into fixed-length chunks or blocks"
     64    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
     65    args = [iter(iterable)] * n
     66    return zip_longest(*args, fillvalue=fillvalue)
     67 
     68 
     69 def writeMappingHeader(println, description, source, url):
     70    if type(description) is not list:
     71        description = [description]
     72    for desc in description:
     73        println(f"// {desc}")
     74    println(f"// Derived from {source}.")
     75    println(f"// {url}")
     76 
     77 
     78 def writeMappingsVar(println, mapping, name, description, source, url):
     79    """Writes a variable definition with a mapping table.
     80 
     81    Writes the contents of dictionary |mapping| through the |println|
     82    function with the given variable name and a comment with description,
     83    fileDate, and URL.
     84    """
     85    println("")
     86    writeMappingHeader(println, description, source, url)
     87    println(f"var {name} = {{")
     88    for key, value in sorted(mapping.items(), key=itemgetter(0)):
     89        println(f'    "{key}": "{value}",')
     90    println("};")
     91 
     92 
     93 def writeMappingsBinarySearch(
     94    println,
     95    fn_name,
     96    type_name,
     97    name,
     98    validate_fn,
     99    validate_case_fn,
    100    mappings,
    101    tag_maxlength,
    102    description,
    103    source,
    104    url,
    105 ):
    106    """Emit code to perform a binary search on language tag subtags.
    107 
    108    Uses the contents of |mapping|, which can either be a dictionary or set,
    109    to emit a mapping function to find subtag replacements.
    110    """
    111    println("")
    112    writeMappingHeader(println, description, source, url)
    113    println(
    114        f"""
    115 bool mozilla::intl::Locale::{fn_name}({type_name} {name}) {{
    116  MOZ_ASSERT({validate_fn}({name}.Span()));
    117  MOZ_ASSERT({validate_case_fn}({name}.Span()));
    118 """.strip()
    119    )
    120    writeMappingsBinarySearchBody(println, name, name, mappings, tag_maxlength)
    121 
    122    println(
    123        """
    124 }""".lstrip("\n")
    125    )
    126 
    127 
    128 def writeMappingsBinarySearchBody(
    129    println, source_name, target_name, mappings, tag_maxlength
    130 ):
    131    def write_array(subtags, name, length, fixed):
    132        if fixed:
    133            println(f"    static const char {name}[{len(subtags)}][{length + 1}] = {{")
    134        else:
    135            println(f"    static const char* {name}[{len(subtags)}] = {{")
    136 
    137        # Group in pairs of ten to not exceed the 80 line column limit.
    138        for entries in grouper(subtags, 10):
    139            entries = (
    140                f'"{tag}"'.rjust(length + 2) for tag in entries if tag is not None
    141            )
    142            println("      {},".format(", ".join(entries)))
    143 
    144        println("    };")
    145 
    146    trailing_return = True
    147 
    148    # Sort the subtags by length. That enables using an optimized comparator
    149    # for the binary search, which only performs a single |memcmp| for multiple
    150    # of two subtag lengths.
    151    mappings_keys = mappings.keys() if type(mappings) is dict else mappings
    152    for length, subtags in groupby(sorted(mappings_keys, key=len), len):
    153        # Omit the length check if the current length is the maximum length.
    154        if length != tag_maxlength:
    155            println(
    156                f"""
    157  if ({source_name}.Length() == {length}) {{
    158 """.rstrip("\n")
    159            )
    160        else:
    161            trailing_return = False
    162            println(
    163                """
    164  {
    165 """.rstrip("\n")
    166            )
    167 
    168        # The subtags need to be sorted for binary search to work.
    169        subtags = sorted(subtags)
    170 
    171        def equals(subtag):
    172            return f"""{source_name}.EqualTo("{subtag}")"""
    173 
    174        # Don't emit a binary search for short lists.
    175        if len(subtags) == 1:
    176            if type(mappings) is dict:
    177                println(
    178                    f"""
    179    if ({equals(subtags[0])}) {{
    180      {target_name}.Set(mozilla::MakeStringSpan("{mappings[subtags[0]]}"));
    181      return true;
    182    }}
    183    return false;
    184 """.strip("\n")
    185                )
    186            else:
    187                println(
    188                    f"""
    189    return {equals(subtags[0])};
    190 """.strip("\n")
    191                )
    192        elif len(subtags) <= 4:
    193            if type(mappings) is dict:
    194                for subtag in subtags:
    195                    println(
    196                        f"""
    197    if ({equals(subtag)}) {{
    198      {target_name}.Set("{mappings[subtag]}");
    199      return true;
    200    }}
    201 """.strip("\n")
    202                    )
    203 
    204                println(
    205                    """
    206    return false;
    207 """.strip("\n")
    208                )
    209            else:
    210                cond = (equals(subtag) for subtag in subtags)
    211                cond = (" ||\n" + " " * (4 + len("return "))).join(cond)
    212                println(
    213                    f"""
    214    return {cond};
    215 """.strip("\n")
    216                )
    217        else:
    218            write_array(subtags, source_name + "s", length, True)
    219 
    220            if type(mappings) is dict:
    221                write_array([mappings[k] for k in subtags], "aliases", length, False)
    222 
    223                println(
    224                    f"""
    225    if (const char* replacement = SearchReplacement({source_name}s, aliases, {source_name})) {{
    226      {target_name}.Set(mozilla::MakeStringSpan(replacement));
    227      return true;
    228    }}
    229    return false;
    230 """.rstrip()
    231                )
    232            else:
    233                println(
    234                    f"""
    235    return HasReplacement({source_name}s, {source_name});
    236 """.rstrip()
    237                )
    238 
    239        println(
    240            """
    241  }
    242 """.strip("\n")
    243        )
    244 
    245    if trailing_return:
    246        println(
    247            """
    248  return false;"""
    249        )
    250 
    251 
    252 def writeComplexLanguageTagMappings(
    253    println, complex_language_mappings, description, source, url
    254 ):
    255    println("")
    256    writeMappingHeader(println, description, source, url)
    257    println(
    258        """
    259 void mozilla::intl::Locale::PerformComplexLanguageMappings() {
    260  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
    261  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
    262 """.lstrip()
    263    )
    264 
    265    # Merge duplicate language entries.
    266    language_aliases = {}
    267    for deprecated_language, (language, script, region) in sorted(
    268        complex_language_mappings.items(), key=itemgetter(0)
    269    ):
    270        key = (language, script, region)
    271        if key not in language_aliases:
    272            language_aliases[key] = []
    273        else:
    274            language_aliases[key].append(deprecated_language)
    275 
    276    first_language = True
    277    for deprecated_language, (language, script, region) in sorted(
    278        complex_language_mappings.items(), key=itemgetter(0)
    279    ):
    280        key = (language, script, region)
    281        if deprecated_language in language_aliases[key]:
    282            continue
    283 
    284        if_kind = "if" if first_language else "else if"
    285        first_language = False
    286 
    287        cond = (
    288            f'Language().EqualTo("{lang}")'
    289            for lang in [deprecated_language] + language_aliases[key]
    290        )
    291        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
    292 
    293        println(
    294            f"""
    295  {if_kind} ({cond}) {{""".strip("\n")
    296        )
    297 
    298        println(
    299            f"""
    300    SetLanguage("{language}");""".strip("\n")
    301        )
    302 
    303        if script is not None:
    304            println(
    305                f"""
    306    if (Script().Missing()) {{
    307      SetScript("{script}");
    308    }}""".strip("\n")
    309            )
    310        if region is not None:
    311            println(
    312                f"""
    313    if (Region().Missing()) {{
    314      SetRegion("{region}");
    315    }}""".strip("\n")
    316            )
    317        println(
    318            """
    319  }""".strip("\n")
    320        )
    321 
    322    println(
    323        """
    324 }
    325 """.strip("\n")
    326    )
    327 
    328 
    329 def writeComplexRegionTagMappings(
    330    println, complex_region_mappings, description, source, url
    331 ):
    332    println("")
    333    writeMappingHeader(println, description, source, url)
    334    println(
    335        """
    336 void mozilla::intl::Locale::PerformComplexRegionMappings() {
    337  MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
    338  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
    339  MOZ_ASSERT(IsStructurallyValidRegionTag(Region().Span()));
    340  MOZ_ASSERT(IsCanonicallyCasedRegionTag(Region().Span()));
    341 """.lstrip()
    342    )
    343 
    344    # |non_default_replacements| is a list and hence not hashable. Convert it
    345    # to a string to get a proper hashable value.
    346    def hash_key(default, non_default_replacements):
    347        return (default, str(sorted(str(v) for v in non_default_replacements)))
    348 
    349    # Merge duplicate region entries.
    350    region_aliases = {}
    351    for deprecated_region, (default, non_default_replacements) in sorted(
    352        complex_region_mappings.items(), key=itemgetter(0)
    353    ):
    354        key = hash_key(default, non_default_replacements)
    355        if key not in region_aliases:
    356            region_aliases[key] = []
    357        else:
    358            region_aliases[key].append(deprecated_region)
    359 
    360    first_region = True
    361    for deprecated_region, (default, non_default_replacements) in sorted(
    362        complex_region_mappings.items(), key=itemgetter(0)
    363    ):
    364        key = hash_key(default, non_default_replacements)
    365        if deprecated_region in region_aliases[key]:
    366            continue
    367 
    368        if_kind = "if" if first_region else "else if"
    369        first_region = False
    370 
    371        cond = (
    372            f'Region().EqualTo("{region}")'
    373            for region in [deprecated_region] + region_aliases[key]
    374        )
    375        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
    376 
    377        println(
    378            f"""
    379  {if_kind} ({cond}) {{""".strip("\n")
    380        )
    381 
    382        replacement_regions = sorted({
    383            region for (_, _, region) in non_default_replacements
    384        })
    385 
    386        first_case = True
    387        for replacement_region in replacement_regions:
    388            replacement_language_script = sorted(
    389                (language, script)
    390                for (language, script, region) in (non_default_replacements)
    391                if region == replacement_region
    392            )
    393 
    394            if_kind = "if" if first_case else "else if"
    395            first_case = False
    396 
    397            def compare_tags(language, script):
    398                if script is None:
    399                    return f'Language().EqualTo("{language}")'
    400                return f'(Language().EqualTo("{language}") && Script().EqualTo("{script}"))'
    401 
    402            cond = (
    403                compare_tags(language, script)
    404                for (language, script) in replacement_language_script
    405            )
    406            cond = (" ||\n" + " " * (4 + len(if_kind) + 2)).join(cond)
    407 
    408            println(
    409                f"""
    410    {if_kind} ({cond}) {{
    411      SetRegion("{replacement_region}");
    412    }}""".rstrip().strip("\n")
    413            )
    414 
    415        println(
    416            f"""
    417    else {{
    418      SetRegion("{default}");
    419    }}
    420  }}""".rstrip().strip("\n")
    421        )
    422 
    423    println(
    424        """
    425 }
    426 """.strip("\n")
    427    )
    428 
    429 
    430 def writeVariantTagMappings(println, variant_mappings, description, source, url):
    431    """Writes a function definition that maps variant subtags."""
    432    println(
    433        """
    434 static auto ToSpan(const mozilla::Span<const char>& aSpan) {
    435  return aSpan;
    436 }
    437 
    438 template <size_t N>
    439 static auto ToSpan(const mozilla::intl::LanguageTagSubtag<N>& aSubtag) {
    440  return aSubtag.Span();
    441 }
    442 
    443 template <typename T, typename U = T>
    444 static bool IsLessThan(const T& a, const U& b) {
    445  return ToSpan(a) < ToSpan(b);
    446 }
    447 """
    448    )
    449    writeMappingHeader(println, description, source, url)
    450    println(
    451        """
    452 bool mozilla::intl::Locale::PerformVariantMappings() {
    453  // The variant subtags need to be sorted for binary search.
    454  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
    455                            IsLessThan<decltype(mVariants)::ElementType>));
    456 
    457  auto removeVariantAt = [&](size_t index) {
    458    mVariants.erase(mVariants.begin() + index);
    459  };
    460 
    461  auto insertVariantSortedIfNotPresent = [&](mozilla::Span<const char> variant) {
    462    auto* p = std::lower_bound(
    463        mVariants.begin(), mVariants.end(), variant,
    464        IsLessThan<decltype(mVariants)::ElementType, decltype(variant)>);
    465 
    466    // Don't insert the replacement when already present.
    467    if (p != mVariants.end() && p->Span() == variant) {
    468      return true;
    469    }
    470 
    471    // Insert the preferred variant in sort order.
    472    auto preferred = mozilla::intl::VariantSubtag{variant};
    473    return !!mVariants.insert(p, preferred);
    474  };
    475 
    476  for (size_t i = 0; i < mVariants.length();) {
    477    const auto& variant = mVariants[i];
    478    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant.Span()));
    479 """.lstrip()
    480    )
    481 
    482    (no_alias, with_alias) = partition(
    483        variant_mappings.items(), lambda item: item[1] is None
    484    )
    485 
    486    no_replacements = " ||\n        ".join(
    487        f"""variant.Span() == mozilla::MakeStringSpan("{deprecated_variant}")"""
    488        for (deprecated_variant, _) in sorted(no_alias, key=itemgetter(0))
    489    )
    490 
    491    println(
    492        f"""
    493    if ({no_replacements}) {{
    494      removeVariantAt(i);
    495    }}
    496 """.strip("\n")
    497    )
    498 
    499    for deprecated_variant, (type, replacement) in sorted(
    500        with_alias, key=itemgetter(0)
    501    ):
    502        println(
    503            f"""
    504    else if (variant.Span() == mozilla::MakeStringSpan("{deprecated_variant}")) {{
    505      removeVariantAt(i);
    506 """.strip("\n")
    507        )
    508 
    509        if type == "language":
    510            println(
    511                f"""
    512      SetLanguage("{replacement}");
    513 """.strip("\n")
    514            )
    515        elif type == "region":
    516            println(
    517                f"""
    518      SetRegion("{replacement}");
    519 """.strip("\n")
    520            )
    521        else:
    522            assert type == "variant"
    523            println(
    524                f"""
    525      if (!insertVariantSortedIfNotPresent(mozilla::MakeStringSpan("{replacement}"))) {{
    526        return false;
    527      }}
    528 """.strip("\n")
    529            )
    530 
    531        println(
    532            """
    533    }
    534 """.strip("\n")
    535        )
    536 
    537    println(
    538        """
    539    else {
    540      i++;
    541    }
    542  }
    543  return true;
    544 }
    545 """.strip("\n")
    546    )
    547 
    548 
    549 def writeLegacyMappingsFunction(println, legacy_mappings, description, source, url):
    550    """Writes a function definition that maps legacy language tags."""
    551    println("")
    552    writeMappingHeader(println, description, source, url)
    553    println(
    554        """\
    555 bool mozilla::intl::Locale::UpdateLegacyMappings() {
    556  // We're mapping legacy tags to non-legacy form here.
    557  // Other tags remain unchanged.
    558  //
    559  // Legacy tags are either sign language tags ("sgn") or have one or multiple
    560  // variant subtags. Therefore we can quickly exclude most tags by checking
    561  // these two subtags.
    562 
    563  MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
    564 
    565  if (!Language().EqualTo("sgn") && mVariants.length() == 0) {
    566    return true;
    567  }
    568 
    569 #ifdef DEBUG
    570  for (const auto& variant : Variants()) {
    571    MOZ_ASSERT(IsStructurallyValidVariantTag(variant));
    572    MOZ_ASSERT(IsCanonicallyCasedVariantTag(variant));
    573  }
    574 #endif
    575 
    576  // The variant subtags need to be sorted for binary search.
    577  MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
    578                            IsLessThan<decltype(mVariants)::ElementType>));
    579 
    580  auto findVariant = [this](mozilla::Span<const char> variant) {
    581    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
    582                               IsLessThan<decltype(mVariants)::ElementType,
    583                                          decltype(variant)>);
    584 
    585    if (p != mVariants.end() && p->Span() == variant) {
    586      return p;
    587    }
    588    return static_cast<decltype(p)>(nullptr);
    589  };
    590 
    591  auto insertVariantSortedIfNotPresent = [&](mozilla::Span<const char> variant) {
    592    auto* p = std::lower_bound(mVariants.begin(), mVariants.end(), variant,
    593                               IsLessThan<decltype(mVariants)::ElementType,
    594                                          decltype(variant)>);
    595 
    596    // Don't insert the replacement when already present.
    597    if (p != mVariants.end() && p->Span() == variant) {
    598      return true;
    599    }
    600 
    601    // Insert the preferred variant in sort order.
    602    auto preferred = mozilla::intl::VariantSubtag{variant};
    603    return !!mVariants.insert(p, preferred);
    604  };
    605 
    606  auto removeVariant = [&](auto* p) {
    607    size_t index = std::distance(mVariants.begin(), p);
    608    mVariants.erase(mVariants.begin() + index);
    609  };
    610 
    611  auto removeVariants = [&](auto* p, auto* q) {
    612    size_t pIndex = std::distance(mVariants.begin(), p);
    613    size_t qIndex = std::distance(mVariants.begin(), q);
    614    MOZ_ASSERT(pIndex < qIndex, "variant subtags are sorted");
    615 
    616    mVariants.erase(mVariants.begin() + qIndex);
    617    mVariants.erase(mVariants.begin() + pIndex);
    618  };"""
    619    )
    620 
    621    # Helper class for pattern matching.
    622    class AnyClass:
    623        def __eq__(self, obj):
    624            return obj is not None
    625 
    626    Any = AnyClass()
    627 
    628    # Group the mappings by language.
    629    legacy_mappings_by_language = {}
    630    for type, replacement in legacy_mappings.items():
    631        (language, _, _, _) = type
    632        legacy_mappings_by_language.setdefault(language, {})[type] = replacement
    633 
    634    # Handle the empty language case first.
    635    if None in legacy_mappings_by_language:
    636        # Get the mappings and remove them from the dict.
    637        mappings = legacy_mappings_by_language.pop(None)
    638 
    639        # This case only applies for the "hepburn-heploc" -> "alalc97"
    640        # mapping, so just inline it here.
    641        from_tag = (None, None, None, "hepburn-heploc")
    642        to_tag = (None, None, None, "alalc97")
    643 
    644        assert len(mappings) == 1
    645        assert mappings[from_tag] == to_tag
    646 
    647        println(
    648            """
    649  if (mVariants.length() >= 2) {
    650    if (auto* hepburn = findVariant(mozilla::MakeStringSpan("hepburn"))) {
    651      if (auto* heploc = findVariant(mozilla::MakeStringSpan("heploc"))) {
    652        removeVariants(hepburn, heploc);
    653 
    654        if (!insertVariantSortedIfNotPresent(mozilla::MakeStringSpan("alalc97"))) {
    655          return false;
    656        }
    657      }
    658    }
    659  }
    660 """
    661        )
    662 
    663    # Handle sign languages next.
    664    if "sgn" in legacy_mappings_by_language:
    665        mappings = legacy_mappings_by_language.pop("sgn")
    666 
    667        # Legacy sign language mappings have the form "sgn-XX" where "XX" is
    668        # some region code.
    669        assert all(type == ("sgn", None, Any, None) for type in mappings.keys())
    670 
    671        # Legacy sign languages are mapped to a single language subtag.
    672        assert all(
    673            replacement == (Any, None, None, None) for replacement in mappings.values()
    674        )
    675 
    676        println(
    677            """
    678  if (Language().EqualTo("sgn")) {
    679    if (Region().Present() && SignLanguageMapping(mLanguage, Region())) {
    680      mRegion.Set(mozilla::MakeStringSpan(""));
    681    }
    682  }
    683 """.rstrip().lstrip("\n")
    684        )
    685 
    686    # Finally handle all remaining cases.
    687 
    688    # The remaining mappings have neither script nor region subtags in the source locale.
    689    assert all(
    690        type == (Any, None, None, Any)
    691        for mappings in legacy_mappings_by_language.values()
    692        for type in mappings.keys()
    693    )
    694 
    695    # And they have neither script nor region nor variant subtags in the target locale.
    696    assert all(
    697        replacement == (Any, None, None, None)
    698        for mappings in legacy_mappings_by_language.values()
    699        for replacement in mappings.values()
    700    )
    701 
    702    # Compact the mappings table by removing empty fields.
    703    legacy_mappings_by_language = {
    704        lang: {
    705            variants: r_language
    706            for ((_, _, _, variants), (r_language, _, _, _)) in mappings.items()
    707        }
    708        for (lang, mappings) in legacy_mappings_by_language.items()
    709    }
    710 
    711    # Try to combine the remaining cases.
    712    legacy_mappings_compact = {}
    713 
    714    # Python can't hash dicts or lists, so use the string representation as the hash key.
    715    def hash_key(mappings):
    716        return str(sorted(mappings.items(), key=itemgetter(0)))
    717 
    718    for lang, mappings in sorted(
    719        legacy_mappings_by_language.items(), key=itemgetter(0)
    720    ):
    721        key = hash_key(mappings)
    722        legacy_mappings_compact.setdefault(key, []).append(lang)
    723 
    724    for langs in legacy_mappings_compact.values():
    725        language_equal_to = (
    726            f"""Language().EqualTo("{lang}")""" for lang in sorted(langs)
    727        )
    728        cond = f""" ||\n{" " * len("  else if (")}""".join(language_equal_to)
    729 
    730        println(
    731            f"""
    732  else if ({cond}) {{
    733 """.rstrip().lstrip("\n")
    734        )
    735 
    736        mappings = legacy_mappings_by_language[langs[0]]
    737 
    738        # Count the variant subtags to determine the sort order.
    739        def variant_size(m):
    740            (k, _) = m
    741            return len(k.split("-"))
    742 
    743        # Alias rules are applied by largest union size first.
    744        for size, mappings_by_size in groupby(
    745            sorted(mappings.items(), key=variant_size, reverse=True), key=variant_size
    746        ):
    747            # Convert grouper object to dict.
    748            mappings_by_size = dict(mappings_by_size)
    749 
    750            is_first = True
    751            chain_if = size == 1
    752 
    753            # Alias rules are applied in alphabetical order
    754            for variants, r_language in sorted(
    755                mappings_by_size.items(), key=itemgetter(0)
    756            ):
    757                sorted_variants = sorted(variants.split("-"))
    758                len_variants = len(sorted_variants)
    759 
    760                maybe_else = "else " if chain_if and not is_first else ""
    761                is_first = False
    762 
    763                for i, variant in enumerate(sorted_variants):
    764                    println(
    765                        f"""
    766    {"  " * i}{maybe_else}if (auto* {variant} = findVariant(mozilla::MakeStringSpan("{variant}"))) {{
    767 """.rstrip().lstrip("\n")
    768                    )
    769 
    770                indent = "  " * len_variants
    771 
    772                println(
    773                    f"""
    774    {indent}removeVariant{"s" if len_variants > 1 else ""}({", ".join(sorted_variants)});
    775    {indent}SetLanguage("{r_language}");
    776    {indent}{"return true;" if not chain_if else ""}
    777 """.rstrip().lstrip("\n")
    778                )
    779 
    780                for i in range(len_variants, 0, -1):
    781                    println(
    782                        f"""
    783    {"  " * (i - 1)}}}
    784 """.rstrip().lstrip("\n")
    785                    )
    786 
    787        println(
    788            """
    789  }
    790 """.rstrip().lstrip("\n")
    791        )
    792 
    793    println(
    794        """
    795  return true;
    796 }"""
    797    )
    798 
    799 
    800 def writeSignLanguageMappingsFunction(
    801    println, legacy_mappings, description, source, url
    802 ):
    803    """Writes a function definition that maps legacy sign language tags."""
    804    println("")
    805    writeMappingHeader(println, description, source, url)
    806    println(
    807        """\
    808 bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
    809                                                const RegionSubtag& region) {
    810  MOZ_ASSERT(language.EqualTo("sgn"));
    811  MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
    812  MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
    813 """.rstrip()
    814    )
    815 
    816    region_mappings = {
    817        rg: lg
    818        for ((lang, _, rg, _), (lg, _, _, _)) in legacy_mappings.items()
    819        if lang == "sgn"
    820    }
    821 
    822    source_name = "region"
    823    target_name = "language"
    824    tag_maxlength = 3
    825    writeMappingsBinarySearchBody(
    826        println, source_name, target_name, region_mappings, tag_maxlength
    827    )
    828 
    829    println(
    830        """
    831 }""".lstrip()
    832    )
    833 
    834 
def readSupplementalData(core_file):
    """Reads CLDR Supplemental Data and extracts information for Intl.js.

    Information extracted:
    - legacyMappings: mappings from legacy tags to preferred complete language tags
    - languageMappings: mappings from language subtags to preferred subtags
    - complexLanguageMappings: mappings from language subtags with complex rules
    - scriptMappings: mappings from script subtags to preferred subtags
    - regionMappings: mappings from region subtags to preferred subtags
    - complexRegionMappings: mappings from region subtags with complex rules
    - variantMappings: mappings from variant subtags to preferred subtags
    - likelySubtags: likely subtags used for generating test data only
    Returns these mappings as dictionaries.

    `core_file` is the CLDR "core" archive; only its `open(path)` method is
    used here to read XML members (e.g. a zipfile.ZipFile works).
    """
    import xml.etree.ElementTree as ET

    # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
    re_unicode_language_id = re.compile(
        r"""
        ^
        # unicode_language_id = unicode_language_subtag
        #     unicode_language_subtag = alpha{2,3} | alpha{5,8}
        (?P<language>[a-z]{2,3}|[a-z]{5,8})

        # (sep unicode_script_subtag)?
        #     unicode_script_subtag = alpha{4}
        (?:-(?P<script>[a-z]{4}))?

        # (sep unicode_region_subtag)?
        #     unicode_region_subtag = (alpha{2} | digit{3})
        (?:-(?P<region>([a-z]{2}|[0-9]{3})))?

        # (sep unicode_variant_subtag)*
        #     unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
        (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
        $
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    # CLDR uses "_" as the separator for some elements. Replace it with "-".
    def bcp47_id(cldr_id):
        return cldr_id.replace("_", "-")

    # Return the tuple (language, script, region, variants) and assert all
    # subtags are in canonical case.
    def bcp47_canonical(language, script, region, variants):
        # Canonical case for language subtags is lower case.
        assert language is None or language.lower() == language

        # Canonical case for script subtags is title case.
        assert script is None or script.title() == script

        # Canonical case for region subtags is upper case.
        assert region is None or region.upper() == region

        # Canonical case for variant subtags is lower case.
        assert variants is None or variants.lower() == variants

        # The "variants" regex capture keeps its leading "-"; strip it here.
        return (language, script, region, variants[1:] if variants else None)

    # Language ids are interpreted as multi-maps in
    # <https://www.unicode.org/reports/tr35/#LocaleId_Canonicalization>.
    #
    # See UTS35, §Annex C, Definitions - 1. Multimap interpretation.
    def language_id_to_multimap(language_id):
        match = re_unicode_language_id.match(language_id)
        assert match is not None, (
            f"{language_id} invalid Unicode BCP 47 locale identifier"
        )

        canonical_language_id = bcp47_canonical(
            *match.group("language", "script", "region", "variants")
        )
        (language, _, _, _) = canonical_language_id

        # Normalize "und" language to None, but keep the rest as is.
        return (language if language != "und" else None,) + canonical_language_id[1:]

    # Alias rules keyed by their multimap "type"; territory rules with more
    # than one replacement are additionally recorded separately.
    rules = {}
    territory_exception_rules = {}

    tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))

    # Load the rules from supplementalMetadata.xml.
    #
    # See UTS35, §Annex C, Definitions - 2. Alias elements.
    # See UTS35, §Annex C, Preprocessing.
    for alias_name in [
        "languageAlias",
        "scriptAlias",
        "territoryAlias",
        "variantAlias",
    ]:
        for alias in tree.iterfind(".//" + alias_name):
            # Replace '_' by '-'.
            type = bcp47_id(alias.get("type"))
            replacement = bcp47_id(alias.get("replacement"))

            # Prefix with "und-".
            if alias_name != "languageAlias":
                type = "und-" + type

            # Discard all rules where the type is an invalid languageId.
            if re_unicode_language_id.match(type) is None:
                continue

            type = language_id_to_multimap(type)

            # Multiple, whitespace-separated territory replacements may be present.
            if alias_name == "territoryAlias" and " " in replacement:
                replacements = replacement.split(" ")
                replacement_list = [
                    language_id_to_multimap("und-" + r) for r in replacements
                ]

                assert type not in territory_exception_rules, (
                    f"Duplicate alias rule: {type}"
                )

                territory_exception_rules[type] = replacement_list

                # The first element is the default territory replacement.
                replacement = replacements[0]

            # Prefix with "und-".
            if alias_name != "languageAlias":
                replacement = "und-" + replacement

            replacement = language_id_to_multimap(replacement)

            assert type not in rules, f"Duplicate alias rule: {type}"

            rules[type] = replacement

    # Helper class for pattern matching: compares equal to any non-None value,
    # so it acts as a wildcard slot in the tuple-shape asserts below.
    class AnyClass:
        def __eq__(self, obj):
            return obj is not None

    Any = AnyClass()

    # Iterate the closure computation until no new rules are produced;
    # loop_count guards against unexpected extra iterations (see below).
    modified_rules = True
    loop_count = 0

    while modified_rules:
        modified_rules = False
        loop_count += 1

        # UTS 35 defines that canonicalization is applied until a fixed point has
        # been reached. This iterative application of the canonicalization algorithm
        # is only needed for a relatively small set of rules, so we can precompute
        # the transitive closure of all rules here and then perform a single pass
        # when canonicalizing language tags at runtime.
        transitive_rules = {}

        # Compute the transitive closure.
        # Any case which currently doesn't occur in the CLDR sources isn't supported
        # and will lead to throwing an error.
        for type, replacement in rules.items():
            (language, script, region, variants) = type
            (r_language, r_script, r_region, r_variants) = replacement

            for i_type, i_replacement in rules.items():
                (i_language, i_script, i_region, i_variants) = i_type
                (i_r_language, i_r_script, i_r_region, i_r_variants) = i_replacement

                if i_language is not None and i_language == r_language:
                    # This case currently only occurs when neither script nor region
                    # subtags are present. A single variant subtags may be present
                    # in |type|. And |i_type| definitely has a single variant subtag.
                    # Should this ever change, update this code accordingly.
                    assert type in {
                        (Any, None, None, None),
                        (Any, None, None, Any),
                    }
                    assert replacement == (Any, None, None, None)
                    assert i_type == (Any, None, None, Any)
                    assert i_replacement == (Any, None, None, None)

                    # This case happens for the rules
                    #   "zh-guoyu -> zh",
                    #   "zh-hakka -> hak", and
                    #   "und-hakka -> und".
                    # Given the possible input "zh-guoyu-hakka", the first rule will
                    # change it to "zh-hakka", and then the second rule can be
                    # applied. (The third rule isn't applied ever.)
                    #
                    # Let's assume there's a hypothetical rule
                    #   "zh-aaaaa" -> "en"
                    # And we have the input "zh-aaaaa-hakka", then "zh-aaaaa -> en"
                    # is applied before "zh-hakka -> hak", because rules are sorted
                    # alphabetically. That means the overall result is "en":
                    # "zh-aaaaa-hakka" is first canonicalized to "en-hakka" and then
                    # "hakka" is removed through the third rule.
                    #
                    # No current rule requires to handle this special case, so we
                    # don't yet support it.
                    assert variants is None or variants <= i_variants

                    # Combine all variants and remove duplicates.
                    vars = set(
                        i_variants.split("-")
                        + (variants.split("-") if variants else [])
                    )

                    # Add the variants alphabetically sorted.
                    n_type = (language, None, None, "-".join(sorted(vars)))

                    assert (
                        n_type not in transitive_rules
                        or transitive_rules[n_type] == i_replacement
                    )
                    transitive_rules[n_type] = i_replacement

                    continue

                if i_script is not None and i_script == r_script:
                    # This case currently doesn't occur, so we don't yet support it.
                    raise ValueError(
                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
                    )
                if i_region is not None and i_region == r_region:
                    # This case currently only applies for sign language
                    # replacements. Similar to the language subtag case any other
                    # combination isn't currently supported.
                    assert type == (None, None, Any, None)
                    assert replacement == (None, None, Any, None)
                    assert i_type == ("sgn", None, Any, None)
                    assert i_replacement == (Any, None, None, None)

                    n_type = ("sgn", None, region, None)

                    assert n_type not in transitive_rules
                    transitive_rules[n_type] = i_replacement

                    continue

                if i_variants is not None and i_variants == r_variants:
                    # This case currently doesn't occur, so we don't yet support it.
                    raise ValueError(
                        f"{type} -> {replacement} :: {i_type} -> {i_replacement}"
                    )

        # Ensure there are no contradicting rules.
        assert all(
            rules[type] == replacement
            for (type, replacement) in transitive_rules.items()
            if type in rules
        )

        # If |transitive_rules| is not a subset of |rules|, new rules will be added.
        modified_rules = not (transitive_rules.keys() <= rules.keys())

        # Ensure we only have to iterate more than once for the "guoyo-{hakka,xiang}"
        # case. Failing this assertion means either there's a bug when computing the
        # stop condition of this loop or a new kind of legacy language tags was added.
        if modified_rules and loop_count > 1:
            new_rules = {k for k in transitive_rules.keys() if k not in rules}
            for k in new_rules:
                assert k in {
                    (Any, None, None, "guoyu-hakka"),
                    (Any, None, None, "guoyu-xiang"),
                }

        # Merge the transitive rules.
        rules.update(transitive_rules)

    # Computes the size of the union of all field value sets.
    def multi_map_size(locale_id):
        (language, script, region, variants) = locale_id

        return (
            (1 if language is not None else 0)
            + (1 if script is not None else 0)
            + (1 if region is not None else 0)
            + (len(variants.split("-")) if variants is not None else 0)
        )

    # Dictionary of legacy mappings, contains raw rules, e.g.
    # (None, None, None, "hepburn-heploc") -> (None, None, None, "alalc97").
    legacy_mappings = {}

    # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
    language_mappings = {}

    # Dictionary of complex language subtag mappings, modifying more than one
    # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
    complex_language_mappings = {}

    # Dictionary of simple script subtag mappings, e.g. "Qaai" -> "Zinh".
    script_mappings = {}

    # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
    region_mappings = {}

    # Dictionary of complex region subtag mappings, containing more than one
    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
    complex_region_mappings = {}

    # Dictionary of aliased variant subtags to a tuple of preferred replacement
    # type and replacement, e.g. "arevela" -> ("language", "hy") or
    # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
    variant_mappings = {}

    # Preprocess all rules so we can perform a single lookup per subtag at runtime.
    for type, replacement in rules.items():
        (language, script, region, variants) = type
        (r_language, r_script, r_region, r_variants) = replacement

        type_map_size = multi_map_size(type)

        # Most mappings are one-to-one and can be encoded through lookup tables.
        if type_map_size == 1:
            if language is not None:
                assert r_language is not None, "Can't remove a language subtag"

                # We don't yet support this case.
                assert r_variants is None, (
                    f"Unhandled variant replacement in language alias: {replacement}"
                )

                if replacement == (Any, None, None, None):
                    language_mappings[language] = r_language
                else:
                    # Drop the (asserted-None above) variants entry.
                    complex_language_mappings[language] = replacement[:-1]
            elif script is not None:
                # We don't support removing script subtags.
                assert r_script is not None, (
                    f"Can't remove a script subtag: {replacement}"
                )

                # We only support one-to-one script mappings for now.
                assert replacement == (
                    None,
                    Any,
                    None,
                    None,
                ), f"Unhandled replacement in script alias: {replacement}"

                script_mappings[script] = r_script
            elif region is not None:
                # We don't support removing region subtags.
                assert r_region is not None, (
                    f"Can't remove a region subtag: {replacement}"
                )

                # We only support one-to-one region mappings for now.
                assert replacement == (
                    None,
                    None,
                    Any,
                    None,
                ), f"Unhandled replacement in region alias: {replacement}"

                if type not in territory_exception_rules:
                    region_mappings[region] = r_region
                else:
                    complex_region_mappings[region] = [
                        r_region
                        for (_, _, r_region, _) in territory_exception_rules[type]
                    ]
            else:
                assert variants is not None
                assert len(variants.split("-")) == 1

                # We only support one-to-one variant mappings for now.
                assert multi_map_size(replacement) <= 1, (
                    f"Unhandled replacement in variant alias: {replacement}"
                )

                if r_language is not None:
                    variant_mappings[variants] = ("language", r_language)
                elif r_script is not None:
                    variant_mappings[variants] = ("script", r_script)
                elif r_region is not None:
                    variant_mappings[variants] = ("region", r_region)
                elif r_variants is not None:
                    assert len(r_variants.split("-")) == 1
                    variant_mappings[variants] = ("variant", r_variants)
                else:
                    # None means the variant subtag is simply removed.
                    variant_mappings[variants] = None
        else:
            # Alias rules which have multiple input fields must be processed
            # first. This applies only to a handful of rules, so our generated
            # code adds fast paths to skip these rules in the common case.

            # Case 1: Language and at least one variant subtag.
            if language is not None and variants is not None:
                pass

            # Case 2: Sign language and a region subtag.
            elif language == "sgn" and region is not None:
                pass

            # Case 3: "hepburn-heploc" to "alalc97" canonicalization.
            elif (
                language is None
                and variants is not None
                and len(variants.split("-")) == 2
            ):
                pass

            # Any other combination is currently unsupported.
            else:
                raise ValueError(f"{type} -> {replacement}")

            legacy_mappings[type] = replacement

    tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))

    likely_subtags = {}

    for likely_subtag in tree.iterfind(".//likelySubtag"):
        from_tag = bcp47_id(likely_subtag.get("from"))
        from_match = re_unicode_language_id.match(from_tag)
        assert from_match is not None, (
            f"{from_tag} invalid Unicode BCP 47 locale identifier"
        )
        assert from_match.group("variants") is None, (
            f"unexpected variant subtags in {from_tag}"
        )

        to_tag = bcp47_id(likely_subtag.get("to"))
        to_match = re_unicode_language_id.match(to_tag)
        assert to_match is not None, (
            f"{to_tag} invalid Unicode BCP 47 locale identifier"
        )
        assert to_match.group("variants") is None, (
            f"unexpected variant subtags in {to_tag}"
        )

        from_canonical = bcp47_canonical(
            *from_match.group("language", "script", "region", "variants")
        )

        to_canonical = bcp47_canonical(
            *to_match.group("language", "script", "region", "variants")
        )

        # Remove the empty variant subtags.
        from_canonical = from_canonical[:-1]
        to_canonical = to_canonical[:-1]

        likely_subtags[from_canonical] = to_canonical

    complex_region_mappings_final = {}

    for deprecated_region, replacements in complex_region_mappings.items():
        # Find all likely subtag entries which don't already contain a region
        # subtag and whose target region is in the list of replacement regions.
        region_likely_subtags = [
            (from_language, from_script, to_region)
            for (
                (from_language, from_script, from_region),
                (_, _, to_region),
            ) in likely_subtags.items()
            if from_region is None and to_region in replacements
        ]

        # The first replacement entry is the default region.
        default = replacements[0]

        # Find all likely subtag entries whose region matches the default region.
        default_replacements = {
            (language, script)
            for (language, script, region) in region_likely_subtags
            if region == default
        }

        # And finally find those entries which don't use the default region.
        # These are the entries we're actually interested in, because those need
        # to be handled specially when selecting the correct preferred region.
        non_default_replacements = [
            (language, script, region)
            for (language, script, region) in region_likely_subtags
            if (language, script) not in default_replacements
        ]

        # Remove redundant mappings.
        #
        # For example starting with CLDR 43, the deprecated region "SU" has the
        # following non-default replacement entries for "GE":
        # - ('sva', None, 'GE')
        # - ('sva', 'Cyrl', 'GE')
        # - ('sva', 'Latn', 'GE')
        #
        # The latter two entries are redundant, because they're already handled
        # by the first entry.
        non_default_replacements = [
            (language, script, region)
            for (language, script, region) in non_default_replacements
            if script is None
            or (language, None, region) not in non_default_replacements
        ]

        # If there are no non-default replacements, we can handle the region as
        # part of the simple region mapping.
        if non_default_replacements:
            complex_region_mappings_final[deprecated_region] = (
                default,
                non_default_replacements,
            )
        else:
            region_mappings[deprecated_region] = default

    return {
        "legacyMappings": legacy_mappings,
        "languageMappings": language_mappings,
        "complexLanguageMappings": complex_language_mappings,
        "scriptMappings": script_mappings,
        "regionMappings": region_mappings,
        "complexRegionMappings": complex_region_mappings_final,
        "variantMappings": variant_mappings,
        "likelySubtags": likely_subtags,
    }
   1350 
   1351 
   1352 def readUnicodeExtensions(core_file):
   1353    import xml.etree.ElementTree as ET
   1354 
   1355    # Match all xml-files in the BCP 47 directory.
   1356    bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
   1357 
   1358    # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
   1359    #
   1360    # type = alphanum{3,8} (sep alphanum{3,8})* ;
   1361    typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
   1362 
   1363    # https://www.unicode.org/reports/tr35/#Unicode_language_identifier
   1364    #
   1365    # unicode_region_subtag = alpha{2} ;
   1366    alphaRegionRE = re.compile(r"^[A-Z]{2}$", re.IGNORECASE)
   1367 
   1368    # Mapping from Unicode extension types to dict of deprecated to
   1369    # preferred values.
   1370    mapping = {
   1371        # Unicode BCP 47 U Extension
   1372        "u": {},
   1373        # Unicode BCP 47 T Extension
   1374        "t": {},
   1375    }
   1376 
   1377    def readBCP47File(file):
   1378        tree = ET.parse(file)
   1379        for keyword in tree.iterfind(".//keyword/key"):
   1380            extension = keyword.get("extension", "u")
   1381            assert extension in {"u", "t"}, f"unknown extension type: {extension}"
   1382 
   1383            extension_name = keyword.get("name")
   1384 
   1385            for type in keyword.iterfind("type"):
   1386                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
   1387                #
   1388                # The key or type name used by Unicode locale extension with 'u' extension
   1389                # syntax or the 't' extensions syntax. When alias below is absent, this name
   1390                # can be also used with the old style "@key=type" syntax.
   1391                name = type.get("name")
   1392 
   1393                # Ignore the special name:
   1394                # - <https://unicode.org/reports/tr35/#CODEPOINTS>
   1395                # - <https://unicode.org/reports/tr35/#REORDER_CODE>
   1396                # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
   1397                # - <https://unicode.org/reports/tr35/#SCRIPT_CODE>
   1398                # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
   1399                # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
   1400                if name in (
   1401                    "CODEPOINTS",
   1402                    "REORDER_CODE",
   1403                    "RG_KEY_VALUE",
   1404                    "SCRIPT_CODE",
   1405                    "SUBDIVISION_CODE",
   1406                    "PRIVATE_USE",
   1407                ):
   1408                    continue
   1409 
   1410                # All other names should match the 'type' production.
   1411                assert typeRE.match(name) is not None, (
   1412                    f"{name} matches the 'type' production"
   1413                )
   1414 
   1415                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
   1416                #
   1417                # The preferred value of the deprecated key, type or attribute element.
   1418                # When a key, type or attribute element is deprecated, this attribute is
   1419                # used for specifying a new canonical form if available.
   1420                preferred = type.get("preferred")
   1421 
   1422                # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
   1423                #
   1424                # The BCP 47 form is the canonical form, and recommended. Other aliases are
   1425                # included only for backwards compatibility.
   1426                alias = type.get("alias")
   1427 
   1428                # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
   1429                #
   1430                # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
   1431                # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
   1432                # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
   1433                # value, while the canonical is in the name attribute value.
   1434 
   1435                # 'preferred' contains the new preferred name, 'alias' the compatibility
   1436                # name, but then there's this entry where 'preferred' and 'alias' are the
   1437                # same. So which one to choose? Assume 'preferred' is the actual canonical
   1438                # name.
   1439                #
   1440                # <type name="islamicc"
   1441                #       description="Civil (algorithmic) Arabic calendar"
   1442                #       deprecated="true"
   1443                #       preferred="islamic-civil"
   1444                #       alias="islamic-civil"/>
   1445 
   1446                if preferred is not None:
   1447                    assert typeRE.match(preferred), preferred
   1448                    mapping[extension].setdefault(extension_name, {})[name] = preferred
   1449 
   1450                if alias is not None:
   1451                    for alias_name in alias.lower().split(" "):
   1452                        # Ignore alias entries which don't match the 'type' production.
   1453                        if typeRE.match(alias_name) is None:
   1454                            continue
   1455 
   1456                        # See comment above when 'alias' and 'preferred' are both present.
   1457                        if (
   1458                            preferred is not None
   1459                            and name in mapping[extension][extension_name]
   1460                        ):
   1461                            continue
   1462 
   1463                        # Skip over entries where 'name' and 'alias' are equal.
   1464                        #
   1465                        # <type name="pst8pdt"
   1466                        #       description="POSIX style time zone for US Pacific Time"
   1467                        #       alias="PST8PDT"
   1468                        #       since="1.8"/>
   1469                        if name == alias_name:
   1470                            continue
   1471 
   1472                        mapping[extension].setdefault(extension_name, {})[
   1473                            alias_name
   1474                        ] = name
   1475 
   1476    def readSupplementalMetadata(file):
   1477        # Find subdivision and region replacements.
   1478        #
   1479        # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
   1480        #
   1481        # Replace aliases in special key values:
   1482        #   - If there is an 'sd' or 'rg' key, replace any subdivision alias
   1483        #     in its value in the same way, using subdivisionAlias data.
   1484        tree = ET.parse(file)
   1485        for alias in tree.iterfind(".//subdivisionAlias"):
   1486            type = alias.get("type")
   1487            assert typeRE.match(type) is not None, (
   1488                f"{type} matches the 'type' production"
   1489            )
   1490 
   1491            # Take the first replacement when multiple ones are present.
   1492            replacement = alias.get("replacement").split(" ")[0].lower()
   1493 
   1494            # Append "zzzz" if the replacement is a two-letter region code.
   1495            if alphaRegionRE.match(replacement) is not None:
   1496                replacement += "zzzz"
   1497 
   1498            # Assert the replacement is syntactically correct.
   1499            assert typeRE.match(replacement) is not None, (
   1500                f"replacement {replacement} matches the 'type' production"
   1501            )
   1502 
   1503            # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
   1504            mapping["u"].setdefault("rg", {})[type] = replacement
   1505            mapping["u"].setdefault("sd", {})[type] = replacement
   1506 
   1507    for name in core_file.namelist():
   1508        if bcpFileRE.match(name):
   1509            readBCP47File(core_file.open(name))
   1510 
   1511    readSupplementalMetadata(
   1512        core_file.open("common/supplemental/supplementalMetadata.xml")
   1513    )
   1514 
   1515    return {
   1516        "unicodeMappings": mapping["u"],
   1517        "transformMappings": mapping["t"],
   1518    }
   1519 
   1520 
def writeCLDRLanguageTagData(println, data, url):
    """Writes the language tag data to the Intl data file.

    println: callable emitting one line of generated C++ output.
    data: dict with keys "version", "legacyMappings", "languageMappings",
        "complexLanguageMappings", "scriptMappings", "regionMappings",
        "complexRegionMappings", "variantMappings", "unicodeMappings",
        and "transformMappings" (produced by the CLDR readers).
    url: CLDR download URL, recorded in the generated file header.
    """

    println(generatedFileWarning)
    println("// Version: CLDR-{}".format(data["version"]))
    println(f"// URL: {url}")

    # Static C++ preamble: includes plus the shared binary-search helpers and
    # DEBUG-only case-validation predicates used by the generated mappings.
    # This is emitted verbatim; do not reformat.
    println(
        """
#include "mozilla/Assertions.h"
#include "mozilla/Span.h"
#include "mozilla/TextUtils.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>

#include "mozilla/intl/Locale.h"

using namespace mozilla::intl::LanguageTagLimits;

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline bool HasReplacement(
    const char (&subtags)[Length][TagLength],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
}

template <size_t Length, size_t TagLength, size_t SubtagLength>
static inline const char* SearchReplacement(
    const char (&subtags)[Length][TagLength], const char* (&aliases)[Length],
    const mozilla::intl::LanguageTagSubtag<SubtagLength>& subtag) {
  MOZ_ASSERT(subtag.Length() == TagLength - 1,
             "subtag must have the same length as the list of subtags");

  const char* ptr = subtag.Span().data();
  auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
                            [](const char* a, const char* b) {
                              return memcmp(a, b, TagLength - 1) < 0;
                            });
  if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
    return aliases[std::distance(std::begin(subtags), p)];
  }
  return nullptr;
}

#ifdef DEBUG
static bool IsAsciiLowercaseAlphanumeric(char c) {
  return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
}

static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
  return IsAsciiLowercaseAlphanumeric(c) || c == '-';
}

static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedScriptTag(mozilla::Span<const char> span) {
  return mozilla::IsAsciiUppercaseAlpha(span[0]) &&
         std::all_of(span.begin() + 1, span.end(),
                     mozilla::IsAsciiLowercaseAlpha<char>);
}

static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(),
                     mozilla::IsAsciiUppercaseAlpha<char>) ||
         std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
}

static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
  return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}

static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
  return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
}

static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
  return std::all_of(type.begin(), type.end(),
                     IsAsciiLowercaseAlphanumericOrDash);
}
#endif
""".rstrip()
    )

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    legacy_mappings = data["legacyMappings"]
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    variant_mappings = data["variantMappings"]
    unicode_mappings = data["unicodeMappings"]
    transform_mappings = data["transformMappings"]

    # Maximum subtag lengths from the UTS 35 unicode_locale_id grammar; the
    # generated tables are sized by these.

    # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
    language_maxlength = 8

    # unicode_script_subtag = alpha{4} ;
    script_maxlength = 4

    # unicode_region_subtag = (alpha{2} | digit{3}) ;
    region_maxlength = 3

    # Simple one-to-one replacements are emitted as binary-searchable tables;
    # the "Complex*" variants below only test membership, with the actual
    # multi-subtag replacements emitted by write{ComplexLanguage,ComplexRegion}-
    # TagMappings further down.
    writeMappingsBinarySearch(
        println,
        "LanguageMapping",
        "LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        language_mappings,
        language_maxlength,
        "Mappings from language subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexLanguageMapping",
        "const LanguageSubtag&",
        "language",
        "IsStructurallyValidLanguageTag",
        "IsCanonicallyCasedLanguageTag",
        complex_language_mappings.keys(),
        language_maxlength,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ScriptMapping",
        "ScriptSubtag&",
        "script",
        "IsStructurallyValidScriptTag",
        "IsCanonicallyCasedScriptTag",
        script_mappings,
        script_maxlength,
        "Mappings from script subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "RegionMapping",
        "RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        region_mappings,
        region_maxlength,
        "Mappings from region subtags to preferred values.",
        source,
        url,
    )
    writeMappingsBinarySearch(
        println,
        "ComplexRegionMapping",
        "const RegionSubtag&",
        "region",
        "IsStructurallyValidRegionTag",
        "IsCanonicallyCasedRegionTag",
        complex_region_mappings.keys(),
        region_maxlength,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeComplexLanguageTagMappings(
        println,
        complex_language_mappings,
        "Language subtags with complex mappings.",
        source,
        url,
    )
    writeComplexRegionTagMappings(
        println,
        complex_region_mappings,
        "Region subtags with complex mappings.",
        source,
        url,
    )

    writeVariantTagMappings(
        println,
        variant_mappings,
        "Mappings from variant subtags to preferred values.",
        source,
        url,
    )

    writeLegacyMappingsFunction(
        println, legacy_mappings, "Canonicalize legacy locale identifiers.", source, url
    )

    writeSignLanguageMappingsFunction(
        println, legacy_mappings, "Mappings from legacy sign languages.", source, url
    )

    writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
    writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
   1746 
   1747 
def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
    """Writes the likely-subtags test file.

    Emits a JS test exercising |Intl.Locale.prototype.maximize()| and
    |minimize()| against the CLDR likelySubtags data in |data|, using the
    "Add Likely Subtags" / "Remove Likely Subtags" algorithms from UTS 35.
    """

    println(generatedFileWarning)

    source = "CLDR Supplemental Data, version {}".format(data["version"])
    language_mappings = data["languageMappings"]
    complex_language_mappings = data["complexLanguageMappings"]
    script_mappings = data["scriptMappings"]
    region_mappings = data["regionMappings"]
    complex_region_mappings = data["complexRegionMappings"]
    likely_subtags = data["likelySubtags"]

    # Format a (language, script, region) tuple as a BCP 47 tag string.
    def bcp47(tag):
        (language, script, region) = tag
        return "{}{}{}".format(
            language, "-" + script if script else "", "-" + region if region else ""
        )

    # Canonicalize a (language, script, region) tuple by applying the
    # deprecated-subtag replacement tables.
    def canonical(tag):
        (language, script, region) = tag

        # Map deprecated language subtags.
        if language in language_mappings:
            language = language_mappings[language]
        elif language in complex_language_mappings:
            (language2, script2, region2) = complex_language_mappings[language]
            (language, script, region) = (
                language2,
                script if script else script2,
                region if region else region2,
            )

        # Map deprecated script subtags.
        if script in script_mappings:
            script = script_mappings[script]

        # Map deprecated region subtags.
        if region in region_mappings:
            region = region_mappings[region]
        else:
            # Assume no complex region mappings are needed for now.
            assert region not in complex_region_mappings, (
                f"unexpected region with complex mappings: {region}"
            )

        return (language, script, region)

    # https://unicode.org/reports/tr35/#Likely_Subtags

    def addLikelySubtags(tag):
        # Step 1: Canonicalize.
        (language, script, region) = canonical(tag)
        if script == "Zzzz":
            script = None
        if region == "ZZ":
            region = None

        # Step 2: Lookup.
        searches = (
            (language, script, region),
            (language, script, None),
            (language, None, region),
            (language, None, None),
        )
        search = next(search for search in searches if search in likely_subtags)

        (language_s, script_s, region_s) = search
        (language_m, script_m, region_m) = likely_subtags[search]

        # Step 3: Return.
        return (
            language if language != language_s else language_m,
            script if script != script_s else script_m,
            region if region != region_s else region_m,
        )

    # https://unicode.org/reports/tr35/#Likely_Subtags
    def removeLikelySubtags(tag):
        # Step 1: Add likely subtags.
        # NOTE: |max| shadows the builtin; kept for parity with the spec text.
        max = addLikelySubtags(tag)

        # Step 2: Remove variants (doesn't apply here).

        # Step 3: Find a match.
        (language, script, region) = max
        for trial in (
            (language, None, None),
            (language, None, region),
            (language, script, None),
        ):
            if addLikelySubtags(trial) == max:
                return trial

        # Step 4: Return maximized if no match found.
        return max

    def likely_canonical(from_tag, to_tag):
        # Canonicalize the input tag.
        from_tag = canonical(from_tag)

        # Update the expected result if necessary.
        if from_tag in likely_subtags:
            to_tag = likely_subtags[from_tag]

        # Canonicalize the expected output.
        to_canonical = canonical(to_tag)

        # Sanity check: This should match the result of |addLikelySubtags|.
        assert to_canonical == addLikelySubtags(from_tag)

        return to_canonical

    # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
    likely_subtags_canonical = {
        k: likely_canonical(k, v) for (k, v) in likely_subtags.items()
    }

    # Add test data for |Intl.Locale.prototype.maximize()|.
    writeMappingsVar(
        println,
        {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
        "maxLikelySubtags",
        "Extracted from likelySubtags.xml.",
        source,
        url,
    )

    # Use the maximalized tags as the input for the remove likely-subtags test.
    minimized = {
        tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()
    }

    # Add test data for |Intl.Locale.prototype.minimize()|.
    writeMappingsVar(
        println,
        {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
        "minLikelySubtags",
        "Extracted from likelySubtags.xml.",
        source,
        url,
    )

    # Emit the JS test driver loops verbatim.
    println(
        """
for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
    assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
}"""
    )

    println(
        """
for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
    assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
}"""
    )

    println(
        """
if (typeof reportCompare === "function")
    reportCompare(0, 0);"""
    )
   1910 
   1911 
   1912 def readCLDRVersionFromICU():
   1913    icuDir = os.path.join(topsrcdir, "intl/icu/source")
   1914    if not os.path.isdir(icuDir):
   1915        raise RuntimeError(f"not a directory: {icuDir}")
   1916 
   1917    reVersion = re.compile(r'\s*cldrVersion\{"(\d+(?:\.\d+)?)"\}')
   1918 
   1919    for line in flines(os.path.join(icuDir, "data/misc/supplementalData.txt")):
   1920        m = reVersion.match(line)
   1921        if m:
   1922            version = m.group(1)
   1923            break
   1924 
   1925    if version is None:
   1926        raise RuntimeError("can't resolve CLDR version")
   1927 
   1928    return version
   1929 
   1930 
def updateCLDRLangTags(args):
    """Update the LanguageTagGenerated.cpp file.

    args: parsed command-line namespace providing |version| (CLDR version or
        None to derive it from ICU), |url| (download URL template containing
        "<VERSION>"), |out| (output file path) and |file| (optional local
        CLDR common.zip).
    """
    version = args.version
    url = args.url
    out = args.out
    filename = args.file

    # Determine current CLDR version from ICU.
    if version is None:
        version = readCLDRVersionFromICU()

    url = url.replace("<VERSION>", version)

    print("Arguments:")
    print("\tCLDR version: %s" % version)
    print("\tDownload url: %s" % url)
    if filename is not None:
        print("\tLocal CLDR common.zip file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    data = {
        "version": version,
    }

    # Populate |data| with the mapping tables extracted from the CLDR zip.
    def readFiles(cldr_file):
        with ZipFile(cldr_file) as zip_file:
            data.update(readSupplementalData(zip_file))
            data.update(readUnicodeExtensions(zip_file))

    print("Processing CLDR data...")
    if filename is not None:
        print("Always make sure you have the newest CLDR common.zip!")
        with open(filename, "rb") as cldr_file:
            readFiles(cldr_file)
    else:
        print("Downloading CLDR common.zip...")
        # Buffer the download in memory so ZipFile can seek within it.
        with closing(urlopen(url)) as cldr_file:
            cldr_data = io.BytesIO(cldr_file.read())
            readFiles(cldr_data)

    print("Writing Intl data...")
    # newline="" keeps the generated files' line endings platform-independent.
    with open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        writeCLDRLanguageTagData(println, data, url)

    print("Writing Intl test data...")
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_file = os.path.join(
        js_src_builtin_intl_dir,
        "../../tests/non262/Intl/Locale/likely-subtags-generated.js",
    )
    with open(test_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println("// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
        writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
   1989 
   1990 
   1991 def flines(filepath, encoding="utf-8"):
   1992    """Open filepath and iterate over its content."""
   1993    with open(filepath, encoding=encoding) as f:
   1994        yield from f
   1995 
   1996 
   1997 @total_ordering
   1998 class Zone:
   1999    """Time zone with optional file name."""
   2000 
   2001    def __init__(self, name, filename=""):
   2002        self.name = name
   2003        self.filename = filename
   2004 
   2005    def __eq__(self, other):
   2006        return hasattr(other, "name") and self.name == other.name
   2007 
   2008    def __lt__(self, other):
   2009        return self.name < other.name
   2010 
   2011    def __hash__(self):
   2012        return hash(self.name)
   2013 
   2014    def __str__(self):
   2015        return self.name
   2016 
   2017    def __repr__(self):
   2018        return self.name
   2019 
   2020 
   2021 class TzDataDir:
   2022    """tzdata source from a directory."""
   2023 
   2024    def __init__(self, obj):
   2025        self.name = partial(os.path.basename, obj)
   2026        self.resolve = partial(os.path.join, obj)
   2027        self.basename = os.path.basename
   2028        self.isfile = os.path.isfile
   2029        self.listdir = partial(os.listdir, obj)
   2030        self.readlines = flines
   2031 
   2032 
   2033 class TzDataFile:
   2034    """tzdata source from a file (tar or gzipped)."""
   2035 
   2036    def __init__(self, obj):
   2037        self.name = lambda: os.path.splitext(
   2038            os.path.splitext(os.path.basename(obj))[0]
   2039        )[0]
   2040        self.resolve = obj.getmember
   2041        self.basename = attrgetter("name")
   2042        self.isfile = tarfile.TarInfo.isfile
   2043        self.listdir = obj.getnames
   2044        self.readlines = partial(self._tarlines, obj)
   2045 
   2046    def _tarlines(self, tar, m):
   2047        with closing(tar.extractfile(m)) as f:
   2048            for line in f:
   2049                yield line.decode("utf-8")
   2050 
   2051 
   2052 def validateTimeZones(zones, links):
   2053    """Validate the zone and link entries."""
   2054    linkZones = set(links.keys())
   2055    intersect = linkZones.intersection(zones)
   2056    if intersect:
   2057        raise RuntimeError("Links also present in zones: %s" % intersect)
   2058 
   2059    zoneNames = {z.name for z in zones}
   2060    linkTargets = set(links.values())
   2061    if not linkTargets.issubset(zoneNames):
   2062        raise RuntimeError(
   2063            "Link targets not found: %s" % linkTargets.difference(zoneNames)
   2064        )
   2065 
   2066 
   2067 def partition(iterable, *predicates):
   2068    def innerPartition(pred, it):
   2069        it1, it2 = tee(it)
   2070        return (filter(pred, it1), filterfalse(pred, it2))
   2071 
   2072    if len(predicates) == 0:
   2073        return iterable
   2074    (left, right) = innerPartition(predicates[0], iterable)
   2075    if len(predicates) == 1:
   2076        return (left, right)
   2077    return tuple([left] + list(partition(right, *predicates[1:])))
   2078 
   2079 
   2080 def listIANAFiles(tzdataDir):
   2081    def isTzFile(d, m, f):
   2082        return m(f) and d.isfile(d.resolve(f))
   2083 
   2084    return filter(
   2085        partial(isTzFile, tzdataDir, re.compile("^[a-z0-9]+$").match),
   2086        tzdataDir.listdir(),
   2087    )
   2088 
   2089 
   2090 def readIANAFiles(tzdataDir, files):
   2091    """Read all IANA time zone files from the given iterable."""
   2092    nameSyntax = r"[\w/+\-]+"
   2093    pZone = re.compile(r"Zone\s+(?P<name>%s)\s+.*" % nameSyntax)
   2094    pLink = re.compile(
   2095        r"(#PACKRATLIST\s+zone.tab\s+)?Link\s+(?P<target>%s)\s+(?P<name>%s)(?:\s+#.*)?"
   2096        % (nameSyntax, nameSyntax)
   2097    )
   2098 
   2099    def createZone(line, fname):
   2100        match = pZone.match(line)
   2101        name = match.group("name")
   2102        return Zone(name, fname)
   2103 
   2104    def createLink(line, fname):
   2105        match = pLink.match(line)
   2106        (name, target) = match.group("name", "target")
   2107        return (Zone(name, fname), target)
   2108 
   2109    zones = set()
   2110    links = dict()
   2111    packrat_links = dict()
   2112    for filename in files:
   2113        filepath = tzdataDir.resolve(filename)
   2114        for line in tzdataDir.readlines(filepath):
   2115            if line.startswith("Zone"):
   2116                zones.add(createZone(line, filename))
   2117            if line.startswith("Link"):
   2118                (link, target) = createLink(line, filename)
   2119                links[link] = target
   2120            if line.startswith("#PACKRATLIST zone.tab Link"):
   2121                (link, target) = createLink(line, filename)
   2122                packrat_links[link] = target
   2123 
   2124    return (zones, links, packrat_links)
   2125 
   2126 
   2127 def readIANATimeZones(tzdataDir, ignoreFactory):
   2128    """Read the IANA time zone information from `tzdataDir`."""
   2129 
   2130    files_to_ignore = ["backzone"]
   2131 
   2132    # Ignore the placeholder time zone "Factory".
   2133    if ignoreFactory:
   2134        files_to_ignore.append("factory")
   2135 
   2136    tzfiles = (file for file in listIANAFiles(tzdataDir) if file not in files_to_ignore)
   2137 
   2138    # Read zone and link infos.
   2139    (zones, links, _) = readIANAFiles(tzdataDir, tzfiles)
   2140 
   2141    validateTimeZones(zones, links)
   2142 
   2143    return (zones, links)
   2144 
   2145 
def readICUResourceFile(filename):
    """Read an ICU resource file.

    Yields a (<table-name>, <value>) pair for each table, where <table-name>
    is the "|"-joined path of nested table names and <value> is None, a
    single int/str, or a list of them.
    """

    # Grammar fragments for scalar values.
    numberValue = r"-?\d+"
    stringValue = r'".+?"'

    # A comma-separated, non-empty sequence of |val|.
    def asVector(val):
        return r"%s(?:\s*,\s*%s)*" % (val, val)

    numberVector = asVector(numberValue)
    stringVector = asVector(stringValue)

    reNumberVector = re.compile(numberVector)
    reStringVector = re.compile(stringVector)
    reNumberValue = re.compile(numberValue)
    reStringValue = re.compile(stringValue)

    # Parse a vector literal into a list of ints or unquoted strings.
    def parseValue(value):
        m = reNumberVector.match(value)
        if m:
            return [int(v) for v in reNumberValue.findall(value)]
        m = reStringVector.match(value)
        if m:
            # Strip the surrounding quotes.
            return [v[1:-1] for v in reStringValue.findall(value)]
        raise RuntimeError("unknown value type: %s" % value)

    # Collapse the accumulated values: none -> None, one -> scalar, else list.
    def extractValue(values):
        if len(values) == 0:
            return None
        if len(values) == 1:
            return values[0]
        return values

    # Build a full-line regex from |args|, tolerating /*...*/ and //-comments.
    # NOTE(review): this helper is shadowed by the |line| loop variable below;
    # harmless because it's only called before the loop starts.
    def line(*args):
        maybeMultiComments = r"(?:/\*[^*]*\*/)*"
        maybeSingleComment = r"(?://.*)?"
        lineStart = "^%s" % maybeMultiComments
        lineEnd = r"%s\s*%s$" % (maybeMultiComments, maybeSingleComment)
        return re.compile(r"\s*".join(chain([lineStart], args, [lineEnd])))

    tableName = r'(?P<quote>"?)(?P<name>.+?)(?P=quote)'
    tableValue = r"(?P<value>%s|%s)" % (numberVector, stringVector)

    # Line shapes: table open, table close, bare value, one-line table, blank.
    reStartTable = line(tableName, r"\{")
    reEndTable = line(r"\}")
    reSingleValue = line(r",?", tableValue, r",?")
    reCompactTable = line(tableName, r"\{", tableValue, r"\}")
    reEmptyLine = line()

    # Stack of currently-open table names.
    tables = []

    def currentTable():
        return "|".join(tables)

    values = []
    # "utf-8-sig" transparently drops a leading BOM if present.
    for line in flines(filename, "utf-8-sig"):
        line = line.strip()
        if line == "":
            continue

        m = reEmptyLine.match(line)
        if m:
            continue

        m = reStartTable.match(line)
        if m:
            assert len(values) == 0
            tables.append(m.group("name"))
            continue

        m = reEndTable.match(line)
        if m:
            # Emit the finished table and reset the value accumulator.
            yield (currentTable(), extractValue(values))
            tables.pop()
            values = []
            continue

        m = reCompactTable.match(line)
        if m:
            assert len(values) == 0
            tables.append(m.group("name"))
            yield (currentTable(), extractValue(parseValue(m.group("value"))))
            tables.pop()
            continue

        m = reSingleValue.match(line)
        if m and tables:
            values.extend(parseValue(m.group("value")))
            continue

        raise RuntimeError("unknown entry: %s" % line)
   2240 
   2241 
   2242 def readICUTimeZonesFromTimezoneTypes(icuTzDir):
   2243    """Read the ICU time zone information from `icuTzDir`/timezoneTypes.txt
   2244    and returns the tuple (zones, links).
   2245    """
   2246    typeMapTimeZoneKey = "timezoneTypes:table(nofallback)|typeMap|timezone|"
   2247    typeAliasTimeZoneKey = "timezoneTypes:table(nofallback)|typeAlias|timezone|"
   2248 
   2249    def toTimeZone(name):
   2250        return Zone(name.replace(":", "/"))
   2251 
   2252    zones = set()
   2253    links = dict()
   2254 
   2255    for name, value in readICUResourceFile(os.path.join(icuTzDir, "timezoneTypes.txt")):
   2256        if name.startswith(typeMapTimeZoneKey):
   2257            zones.add(toTimeZone(name[len(typeMapTimeZoneKey) :]))
   2258        if name.startswith(typeAliasTimeZoneKey):
   2259            links[toTimeZone(name[len(typeAliasTimeZoneKey) :])] = value
   2260 
   2261    validateTimeZones(zones, links)
   2262 
   2263    return (zones, links)
   2264 
   2265 
   2266 def readICUTimeZonesFromZoneInfo(icuTzDir):
   2267    """Read the ICU time zone information from `icuTzDir`/zoneinfo64.txt
   2268    and returns the tuple (zones, links).
   2269    """
   2270    zoneKey = "zoneinfo64:table(nofallback)|Zones:array|:table"
   2271    linkKey = "zoneinfo64:table(nofallback)|Zones:array|:int"
   2272    namesKey = "zoneinfo64:table(nofallback)|Names"
   2273 
   2274    tzId = 0
   2275    tzLinks = dict()
   2276    tzNames = []
   2277 
   2278    for name, value in readICUResourceFile(os.path.join(icuTzDir, "zoneinfo64.txt")):
   2279        if name == zoneKey:
   2280            tzId += 1
   2281        elif name == linkKey:
   2282            tzLinks[tzId] = int(value)
   2283            tzId += 1
   2284        elif name == namesKey:
   2285            tzNames.extend(value)
   2286 
   2287    links = {Zone(tzNames[zone]): tzNames[target] for (zone, target) in tzLinks.items()}
   2288    zones = {Zone(v) for v in tzNames if Zone(v) not in links}
   2289 
   2290    validateTimeZones(zones, links)
   2291 
   2292    return (zones, links)
   2293 
   2294 
def readICUTimeZones(icuDir, icuTzDir, ignoreFactory):
    """Merge ICU's zoneinfo64.txt and timezoneTypes.txt time zone data.

    Returns (icuZones, icuLinks): the zones and canonicalization links ICU
    actually uses. When |ignoreFactory| is true, the placeholder time zone
    "Factory" is dropped from both tables.
    """
    # zoneinfo64.txt contains the supported time zones by ICU. This data is
    # generated from tzdata files, it doesn't include "backzone" in stock ICU.
    (zoneinfoZones, zoneinfoLinks) = readICUTimeZonesFromZoneInfo(icuTzDir)

    # timezoneTypes.txt contains the canonicalization information for ICU. This
    # data is generated from CLDR files. It includes data about time zones from
    # tzdata's "backzone" file.
    (typesZones, typesLinks) = readICUTimeZonesFromTimezoneTypes(icuTzDir)

    # Remove the placeholder time zone "Factory".
    # See also <https://github.com/eggert/tz/blob/master/factory>.
    if ignoreFactory:
        # "Factory" is expected as a plain zone in zoneinfo64 but only as a
        # link in timezoneTypes.
        assert Zone("Factory") in zoneinfoZones
        assert Zone("Factory") not in zoneinfoLinks
        assert Zone("Factory") not in typesZones
        assert Zone("Factory") in typesLinks

        zoneinfoZones.remove(Zone("Factory"))
        del typesLinks[Zone("Factory")]

    # Remove the ICU placeholder time zone "Etc/Unknown".
    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
    for zones in (zoneinfoZones, typesZones):
        zones.remove(Zone("Etc/Unknown"))

    # Remove any outdated ICU links.
    for links in (zoneinfoLinks, typesLinks):
        for zone in otherICULegacyLinks().keys():
            if zone not in links:
                raise KeyError(f"Can't remove non-existent link from '{zone}'")
            del links[zone]

    # Information in zoneinfo64 should be a superset of timezoneTypes.
    def inZoneInfo64(zone):
        return zone in zoneinfoZones or zone in zoneinfoLinks

    notFoundInZoneInfo64 = [zone for zone in typesZones if not inZoneInfo64(zone)]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    notFoundInZoneInfo64 = [
        zone for zone in typesLinks.keys() if not inZoneInfo64(zone)
    ]
    if notFoundInZoneInfo64:
        raise RuntimeError(
            "Missing time zones in zoneinfo64.txt: %s" % notFoundInZoneInfo64
        )

    # zoneinfo64.txt only defines the supported time zones by ICU, the canonicalization
    # rules are defined through timezoneTypes.txt. Merge both to get the actual zones
    # and links used by ICU. Entries from timezoneTypes take precedence: a
    # zoneinfo64 zone that timezoneTypes treats as a link is dropped from the
    # zone set, and vice versa for links.
    icuZones = set(
        chain(
            (zone for zone in zoneinfoZones if zone not in typesLinks),
            (zone for zone in typesZones),
        )
    )
    icuLinks = dict(
        chain(
            (
                (zone, target)
                for (zone, target) in zoneinfoLinks.items()
                if zone not in typesZones
            ),
            ((zone, target) for (zone, target) in typesLinks.items()),
        )
    )

    return (icuZones, icuLinks)
   2367 
   2368 
def readICULegacyZones(icuDir):
    """Read the ICU legacy time zones from `icuDir`/tools/tzcode/icuzones and
    return the tuple (zones, links).

    These are names ICU accepts which aren't valid IANA time zone names, so
    per spec they must be rejected by our implementation.
    """
    tzdir = TzDataDir(os.path.join(icuDir, "tools/tzcode"))

    # Per spec we must recognize only IANA time zones and links, but ICU
    # recognizes various legacy, non-IANA time zones and links. Compute these
    # non-IANA time zones and links.

    # Most legacy, non-IANA time zones and links are in the icuzones file.
    (zones, links, _) = readIANAFiles(tzdir, ["icuzones"])

    # Remove the ICU placeholder time zone "Etc/Unknown".
    # See also <https://unicode.org/reports/tr35/#Time_Zone_Identifiers>.
    zones.remove(Zone("Etc/Unknown"))

    # A handful of non-IANA zones/links are not in icuzones and must be added
    # manually so that we won't invoke ICU with them.
    for zone, target in otherICULegacyLinks().items():
        if zone in links:
            if links[zone] != target:
                raise KeyError(
                    f"Can't overwrite link '{zone} -> {links[zone]}' with '{target}'"
                )
            else:
                # The manual entry duplicates icuzones and is now stale.
                print(
                    f"Info: Link '{zone} -> {target}' can be removed from otherICULegacyLinks()"
                )
        links[zone] = target

    return (zones, links)
   2401 
   2402 
   2403 def otherICULegacyLinks():
   2404    """The file `icuTzDir`/tools/tzcode/icuzones contains all ICU legacy time
   2405    zones with the exception of time zones which are removed by IANA after an
   2406    ICU release.
   2407 
   2408    For example ICU 67 uses tzdata2018i, but tzdata2020b removed the link from
   2409    "US/Pacific-New" to "America/Los_Angeles". ICU standalone tzdata updates
   2410    don't include modified icuzones files, so we must manually record any IANA
   2411    modifications here.
   2412 
   2413    After an ICU update, we can remove any no longer needed entries from this
   2414    function by checking if the relevant entries are now included in icuzones.
   2415    """
   2416 
   2417    return {
   2418        # Current ICU is up-to-date with IANA, so this dict is empty.
   2419    }
   2420 
   2421 
   2422 def icuTzDataVersion(icuTzDir):
   2423    """Read the ICU time zone version from `icuTzDir`/zoneinfo64.txt."""
   2424 
   2425    def searchInFile(pattern, f):
   2426        p = re.compile(pattern)
   2427        for line in flines(f, "utf-8-sig"):
   2428            m = p.search(line)
   2429            if m:
   2430                return m.group(1)
   2431        return None
   2432 
   2433    zoneinfo = os.path.join(icuTzDir, "zoneinfo64.txt")
   2434    if not os.path.isfile(zoneinfo):
   2435        raise RuntimeError("file not found: %s" % zoneinfo)
   2436    version = searchInFile(r"^//\s+tz version:\s+([0-9]{4}[a-z])$", zoneinfo)
   2437    if version is None:
   2438        raise RuntimeError(
   2439            "%s does not contain a valid tzdata version string" % zoneinfo
   2440        )
   2441    return version
   2442 
   2443 
   2444 def findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks):
   2445    """Find incorrect ICU zone entries."""
   2446 
   2447    def isIANATimeZone(zone):
   2448        return zone in ianaZones or zone in ianaLinks
   2449 
   2450    def isICUTimeZone(zone):
   2451        return zone in icuZones or zone in icuLinks
   2452 
   2453    def isICULink(zone):
   2454        return zone in icuLinks
   2455 
   2456    # All IANA zones should be present in ICU.
   2457    missingTimeZones = [zone for zone in ianaZones if not isICUTimeZone(zone)]
   2458    if missingTimeZones:
   2459        raise RuntimeError(
   2460            "Not all zones are present in ICU, did you forget "
   2461            "to run intl/update-tzdata.sh? %s" % missingTimeZones
   2462        )
   2463 
   2464    # Zones which are only present in ICU?
   2465    additionalTimeZones = [zone for zone in icuZones if not isIANATimeZone(zone)]
   2466    if additionalTimeZones:
   2467        raise RuntimeError(
   2468            "Additional zones present in ICU, did you forget "
   2469            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
   2470        )
   2471 
   2472    # Zones which are marked as links in ICU.
   2473    result = ((zone, icuLinks[zone]) for zone in ianaZones if isICULink(zone))
   2474 
   2475    # Remove unnecessary UTC mappings.
   2476    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
   2477    result = ((zone, target) for (zone, target) in result if zone.name not in utcnames)
   2478 
   2479    return sorted(result, key=itemgetter(0))
   2480 
   2481 
   2482 def findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks):
   2483    """Find incorrect ICU link entries."""
   2484 
   2485    def isIANATimeZone(zone):
   2486        return zone in ianaZones or zone in ianaLinks
   2487 
   2488    def isICUTimeZone(zone):
   2489        return zone in icuZones or zone in icuLinks
   2490 
   2491    def isICULink(zone):
   2492        return zone in icuLinks
   2493 
   2494    def isICUZone(zone):
   2495        return zone in icuZones
   2496 
   2497    # All links should be present in ICU.
   2498    missingTimeZones = [zone for zone in ianaLinks.keys() if not isICUTimeZone(zone)]
   2499    if missingTimeZones:
   2500        raise RuntimeError(
   2501            "Not all zones are present in ICU, did you forget "
   2502            "to run intl/update-tzdata.sh? %s" % missingTimeZones
   2503        )
   2504 
   2505    # Links which are only present in ICU?
   2506    additionalTimeZones = [zone for zone in icuLinks.keys() if not isIANATimeZone(zone)]
   2507    if additionalTimeZones:
   2508        raise RuntimeError(
   2509            "Additional links present in ICU, did you forget "
   2510            "to run intl/update-tzdata.sh? %s" % additionalTimeZones
   2511        )
   2512 
   2513    result = chain(
   2514        # IANA links which have a different target in ICU.
   2515        (
   2516            (zone, target, icuLinks[zone])
   2517            for (zone, target) in ianaLinks.items()
   2518            if isICULink(zone) and target != icuLinks[zone]
   2519        ),
   2520        # IANA links which are zones in ICU.
   2521        (
   2522            (zone, target, zone.name)
   2523            for (zone, target) in ianaLinks.items()
   2524            if isICUZone(zone)
   2525        ),
   2526    )
   2527 
   2528    # Remove unnecessary UTC mappings.
   2529    utcnames = ["Etc/UTC", "Etc/UCT", "Etc/GMT"]
   2530    result = (
   2531        (zone, target, icuTarget)
   2532        for (zone, target, icuTarget) in result
   2533        if target not in utcnames or icuTarget not in utcnames
   2534    )
   2535 
   2536    return sorted(result, key=itemgetter(0))
   2537 
   2538 
   2539 def readZoneTab(tzdataDir):
   2540    zone_country = dict()
   2541 
   2542    zonetab_path = tzdataDir.resolve("zone.tab")
   2543    for line in tzdataDir.readlines(zonetab_path):
   2544        if line.startswith("#"):
   2545            continue
   2546        (country, coords, zone, *comments) = line.strip().split("\t")
   2547        assert zone not in zone_country
   2548        zone_country[zone] = country
   2549 
   2550    return zone_country
   2551 
   2552 
   2553 # 6.5.1 AvailableNamedTimeZoneIdentifiers ( )
   2554 #
   2555 # https://tc39.es/ecma402/#sup-availablenamedtimezoneidentifiers
def availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory):
    """Compute the available named time zone identifiers.

    Implements the AvailableNamedTimeZoneIdentifiers abstract operation from
    ECMA-402 and returns the tuple (zones, links), where `links` maps each
    link name to its primary identifier.

    https://tc39.es/ecma402/#sup-availablenamedtimezoneidentifiers
    """
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))

    # Extra zone -> country mappings, used by country_code_for() for zones
    # which aren't listed in zone.tab.
    with open(
        os.path.join(js_src_builtin_intl_dir, "TimeZoneMapping.yaml"),
        encoding="utf-8",
    ) as f:
        time_zone_mapping = yaml.safe_load(f)

    zone_country = readZoneTab(tzdataDir)

    def country_code_for(name):
        # Prefer zone.tab, fall back to TimeZoneMapping.yaml.
        if name in zone_country:
            return zone_country[name]
        return time_zone_mapping[name]

    (ianaZones, ianaLinks) = readIANATimeZones(tzdataDir, ignoreFactory)

    # Links from tzdata's "backzone" file, consulted in step 5.b.iii.4.c.
    (backzones, backlinks, packratlinks) = readIANAFiles(tzdataDir, ["backzone"])
    all_backzone_links = {**backlinks, **packratlinks}

    # Steps 1-3. (Not applicable)

    # Step 4.
    zones = set()
    links = dict()

    # Step 5. (Partial, only zones)
    for zone in ianaZones:
        # Step 5.a.
        primary = zone

        # Step 5.b. (Not applicable for zones)

        # Step 5.c.
        if primary.name in ["Etc/UTC", "Etc/GMT", "GMT"]:
            primary = Zone("UTC", primary.filename)

        # Step 5.d. (Not applicable)

        # Steps 5.e-f.
        if primary == zone:
            assert zone not in zones
            zones.add(primary)
        else:
            assert zone not in links
            links[zone] = primary.name

    # Step 5. (Partial, only links)
    for zone, target in ianaLinks.items():
        identifier = zone.name

        # Step 5.a.
        primary = identifier

        # Step 5.b.
        if identifier not in zone_country:
            # Step 5.b.i. (Not applicable)

            # Steps 5.b.ii-iii.
            if target.startswith("Etc/"):
                primary = target
            else:
                # Step 5.b.iii.1.
                identifier_code_code = country_code_for(identifier)

                # Step 5.b.iii.2.
                target_code_code = country_code_for(target)

                # Steps 5.b.iii.3-4
                if identifier_code_code == target_code_code:
                    primary = target
                else:
                    # Step 5.b.iii.4.a.
                    # All zone.tab zones in the link's country.
                    country_code_line_count = [
                        zone
                        for (zone, code) in zone_country.items()
                        if code == identifier_code_code
                    ]

                    # Steps 5.b.iii.4.b-c.
                    if len(country_code_line_count) == 1:
                        primary = country_code_line_count[0]
                    else:
                        # Fall back to the backzone links for this name.
                        assert Zone(identifier) in all_backzone_links
                        primary = all_backzone_links[Zone(identifier)]
                        assert identifier_code_code == country_code_for(primary)

        # Step 5.c.
        if primary in ["Etc/UTC", "Etc/GMT", "GMT"]:
            primary = "UTC"

        # Step 5.d. (Not applicable)

        # Steps 5.e-f.
        if primary == identifier:
            assert zone not in zones
            zones.add(zone)
        else:
            assert zone not in links
            links[zone] = primary

    # Ensure all zones and links are valid.
    validateTimeZones(zones, links)

    # Step 6.
    assert Zone("UTC") in zones

    # Step 7.
    return (zones, links)
   2666 
   2667 
# Header comments emitted at the top of every generated file.
generatedFileWarning = "// Generated by make_intl_data.py. DO NOT EDIT."
tzdataVersionComment = "// tzdata version = {0}"
   2670 
   2671 
def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreFactory, out):
    """Read the time zone info and create a new time zone cpp file.

    `tzdataDir` holds the IANA tzdata, `icuDir`/`icuTzDir` the ICU source
    resp. ICU time zone data, `version` is the tzdata version string, and
    `out` the path of the generated header.
    """
    print("Processing tzdata mapping...")
    (ianaZones, ianaLinks) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)
    (icuZones, icuLinks) = readICUTimeZones(icuDir, icuTzDir, ignoreFactory)
    (legacyZones, legacyLinks) = readICULegacyZones(icuDir)

    # Treat the placeholder zone "Factory" like a legacy zone so it is
    # filtered out of the ICU zones below.
    if ignoreFactory:
        legacyZones.add(Zone("Factory"))

    # Remove all legacy ICU time zones.
    icuZones = {zone for zone in icuZones if zone not in legacyZones}
    icuLinks = {
        zone: target for (zone, target) in icuLinks.items() if zone not in legacyLinks
    }

    incorrectZones = findIncorrectICUZones(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectZones:
        print("<<< No incorrect ICU time zones found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    incorrectLinks = findIncorrectICULinks(ianaZones, ianaLinks, icuZones, icuLinks)
    if not incorrectLinks:
        print("<<< No incorrect ICU time zone links found, please update Intl.js! >>>")
        print("<<< Maybe https://ssl.icu-project.org/trac/ticket/12044 was fixed? >>>")

    print("Writing Intl tzdata file...")
    with open(out, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println("")

        println("#ifndef builtin_intl_TimeZoneDataGenerated_h")
        println("#define builtin_intl_TimeZoneDataGenerated_h")
        println("")

        println("namespace js {")
        println("namespace timezone {")
        println("")

        # IANA zones which ICU resolves to a different (link) target.
        println("// Format:")
        println('// "ZoneName" // ICU-Name [time zone file]')
        println("const char* const ianaZonesTreatedAsLinksByICU[] = {")
        for zone, icuZone in incorrectZones:
            println('    "%s", // %s [%s]' % (zone, icuZone, zone.filename))
        println("};")
        println("")

        # IANA links which ICU canonicalizes to a different target.
        println("// Format:")
        println('// "LinkName", "Target" // ICU-Target [time zone file]')
        println("struct LinkAndTarget")
        println("{")
        println("    const char* const link;")
        println("    const char* const target;")
        println("};")
        println("")
        println("const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
        for zone, target, icuTarget in incorrectLinks:
            println(
                '    { "%s", "%s" }, // %s [%s]'
                % (zone, target, icuTarget, zone.filename)
            )
        println("};")
        println("")

        println(
            "// Legacy ICU time zones, these are not valid IANA time zone names. We also"
        )
        println("// disallow the old and deprecated System V time zones.")
        println(
            "// https://ssl.icu-project.org/repos/icu/trunk/icu4c/source/tools/tzcode/icuzones"
        )  # NOQA: E501
        println("const char* const legacyICUTimeZones[] = {")
        for zone in chain(sorted(legacyLinks.keys()), sorted(legacyZones)):
            println('    "%s",' % zone)
        println("};")
        println("")

        println("} // namespace timezone")
        println("} // namespace js")
        println("")
        println("#endif /* builtin_intl_TimeZoneDataGenerated_h */")
   2756 
   2757 
def generateTzDataTestLinks(tzdataDir, version, ignoreFactory, testDir):
    """Generate the timeZone_links.js test file.

    The emitted test checks that Intl.DateTimeFormat resolves every IANA link
    name (in any letter case) to its expected target zone.
    """
    fileName = "timeZone_links.js"

    # Read zone and link infos.
    (_, links) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(
            """
const tzMapper = [
    x => x,
    x => x.toUpperCase(),
    x => x.toLowerCase(),
];
"""
        )

        println("// Link names derived from IANA Time Zone Database.")
        println("const links = {")
        for zone, target in sorted(links.items(), key=itemgetter(0)):
            println('    "%s": "%s",' % (zone, target))
        println("};")

        println(
            """
for (let [linkName, target] of Object.entries(links)) {
    if (target === "Etc/UTC" || target === "Etc/GMT")
        target = "UTC";

    for (let map of tzMapper) {
        let dtf = new Intl.DateTimeFormat(undefined, {timeZone: map(linkName)});
        let resolvedTimeZone = dtf.resolvedOptions().timeZone;
        assertEq(resolvedTimeZone, target, `${linkName} -> ${target}`);
    }
}
"""
        )
        println(
            """
if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )
   2809 
   2810 
def generateTzDataTestVersion(tzdataDir, version, testDir):
    """Generate the timeZone_version.js test file.

    The emitted test asserts that ICU reports `version` as its tzdata
    version. Note: `tzdataDir` is unused; kept for symmetry with the other
    generator functions.
    """
    fileName = "timeZone_version.js"

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))
        println(f"""const tzdata = "{version}";""")

        println(
            """
if (typeof getICUOptions === "undefined") {
    var getICUOptions = SpecialPowers.Cu.getJSTestingFunctions().getICUOptions;
}

var options = getICUOptions();

assertEq(options.tzdata, tzdata);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )
   2839 
   2840 
def generateTzDataTestCanonicalZones(tzdataDir, version, ignoreFactory, testDir):
    """Generate the supportedValuesOf-timeZones-canonical.js test file.

    The emitted test asserts that Intl.supportedValuesOf("timeZone") returns
    exactly the sorted list of canonical zones.
    """
    fileName = "supportedValuesOf-timeZones-canonical.js"

    # Read zone and link infos.
    (zones, _) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))

        println("const zones = [")
        for zone in sorted(zones):
            println(f'  "{zone}",')
        println("];")

        println(
            """
let supported = Intl.supportedValuesOf("timeZone");

assertEqArray(supported, zones);

if (typeof reportCompare === "function")
    reportCompare(0, 0, "ok");
"""
        )
   2872 
   2873 
def generateTzDataTestZones(tzdataDir, version, ignoreFactory, testDir):
    """Generate the zones-and-links.js test file.

    The emitted Temporal test checks that each zone round-trips through
    Temporal.ZonedDateTime, and that every link behaves identically to its
    target zone (same offsets across several sample epochs).
    """
    fileName = "zones-and-links.js"

    # Read zone and link infos.
    (zones, links) = availableNamedTimeZoneIdentifiers(tzdataDir, ignoreFactory)

    with open(
        os.path.join(testDir, fileName), mode="w", encoding="utf-8", newline=""
    ) as f:
        println = partial(print, file=f)

        println('// |reftest| skip-if(!this.hasOwnProperty("Temporal"))')
        println("")
        println(generatedFileWarning)
        println(tzdataVersionComment.format(version))

        println("const zones = [")
        for zone in sorted(zones):
            println(f'  "{zone}",')
        println("];")

        println("const links = {")
        for link, target in sorted(links.items(), key=itemgetter(0)):
            println(f'  "{link}": "{target}",')
        println("};")

        println(
            """
let epochNanoseconds = [
  new Temporal.PlainDate(1900, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(1950, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(1960, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(1970, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(1980, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(1990, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(2000, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(2010, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(2020, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
  new Temporal.PlainDate(2030, 1, 1).toZonedDateTime("UTC").epochNanoseconds,
];

function timeZoneId(zdt) {
  let str = zdt.toString();
  let m = str.match(/(?<=\\[)[\\w\\/_+-]+(?=\\])/);
  assertEq(m !== null, true, str);
  return m[0];
}

for (let zone of zones) {
  let zdt = new Temporal.ZonedDateTime(0n, zone);

  assertEq(zdt.timeZoneId, zone);
  assertEq(timeZoneId(zdt), zone);
}

for (let [link, zone] of Object.entries(links)) {
  assertEq(link === zone, false, `link=${link}, zone=${zone}`);
  assertEq(zones.includes(zone), true, `zone=${zone}`);

  let zdtLink = new Temporal.ZonedDateTime(0n, link);
  let zdtZone = new Temporal.ZonedDateTime(0n, zone);

  assertEq(zdtLink.timeZoneId, link);
  assertEq(timeZoneId(zdtLink), link);

  assertEq(zdtZone.timeZoneId, zone);
  assertEq(timeZoneId(zdtZone), zone);

  assertEq(zdtLink.equals(zdtZone), true, `link=${link}, zone=${zone}`);

  assertEq(
    zdtLink.offsetNanoseconds,
    zdtZone.offsetNanoseconds,
    `link=${link}, zone=${zone}`
  );

  for (let epochNs of epochNanoseconds) {
    assertEq(
      new Temporal.ZonedDateTime(epochNs, link).offsetNanoseconds,
      new Temporal.ZonedDateTime(epochNs, zone).offsetNanoseconds,
      `link=${link}, zone=${zone}, epochNs=${epochNs}`
    );
  }
}

if (typeof reportCompare === "function")
  reportCompare(0, 0, "ok");
"""
        )
   2963 
   2964 
   2965 def generateTzDataTests(tzdataDir, version, ignoreFactory, testDir):
   2966    dtfTestDir = os.path.join(testDir, "DateTimeFormat")
   2967    if not os.path.isdir(dtfTestDir):
   2968        raise RuntimeError("not a directory: %s" % dtfTestDir)
   2969 
   2970    zdtTestDir = os.path.join(testDir, "../Temporal/ZonedDateTime")
   2971    if not os.path.isdir(zdtTestDir):
   2972        raise RuntimeError("not a directory: %s" % zdtTestDir)
   2973 
   2974    generateTzDataTestLinks(tzdataDir, version, ignoreFactory, dtfTestDir)
   2975    generateTzDataTestVersion(tzdataDir, version, dtfTestDir)
   2976    generateTzDataTestCanonicalZones(tzdataDir, version, ignoreFactory, testDir)
   2977    generateTzDataTestZones(tzdataDir, version, ignoreFactory, zdtTestDir)
   2978 
   2979 
def updateTzdata(topsrcdir, args):
    """Update the time zone cpp file.

    `topsrcdir` is the source tree root; `args.tz` optionally names a local
    tzdata directory or tarball, and `args.out` is the output path.
    """

    icuDir = os.path.join(topsrcdir, "intl/icu/source")
    if not os.path.isdir(icuDir):
        raise RuntimeError("not a directory: %s" % icuDir)

    icuTzDir = os.path.join(topsrcdir, "intl/tzdata/source")
    if not os.path.isdir(icuTzDir):
        raise RuntimeError("not a directory: %s" % icuTzDir)

    intlTestDir = os.path.join(topsrcdir, "js/src/tests/non262/Intl")
    if not os.path.isdir(intlTestDir):
        raise RuntimeError("not a directory: %s" % intlTestDir)

    tzDir = args.tz
    if tzDir is not None and not (os.path.isdir(tzDir) or os.path.isfile(tzDir)):
        raise RuntimeError("not a directory or file: %s" % tzDir)
    out = args.out

    # Ignore the placeholder time zone "Factory".
    ignoreFactory = True

    # Download URL is derived from the tzdata version recorded in the ICU data.
    version = icuTzDataVersion(icuTzDir)
    url = (
        "https://www.iana.org/time-zones/repository/releases/tzdata%s.tar.gz" % version
    )

    print("Arguments:")
    print("\ttzdata version: %s" % version)
    print("\ttzdata URL: %s" % url)
    print("\ttzdata directory|file: %s" % tzDir)
    print("\tICU directory: %s" % icuDir)
    print("\tICU timezone directory: %s" % icuTzDir)
    print("\tOutput file: %s" % out)
    print("")

    # Process `f`, which is either a tzdata tarball or an unpacked directory.
    def updateFrom(f):
        if os.path.isfile(f) and tarfile.is_tarfile(f):
            with tarfile.open(f, "r:*") as tar:
                processTimeZones(
                    TzDataFile(tar),
                    icuDir,
                    icuTzDir,
                    version,
                    ignoreFactory,
                    out,
                )
                generateTzDataTests(
                    TzDataFile(tar), version, ignoreFactory, intlTestDir
                )
        elif os.path.isdir(f):
            processTimeZones(
                TzDataDir(f),
                icuDir,
                icuTzDir,
                version,
                ignoreFactory,
                out,
            )
            generateTzDataTests(TzDataDir(f), version, ignoreFactory, intlTestDir)
        else:
            raise RuntimeError("unknown format")

    if tzDir is None:
        # No local tzdata supplied: download the matching IANA release into a
        # temporary file and process that.
        print("Downloading tzdata file...")
        with closing(urlopen(url)) as tzfile:
            fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
                print("File stored in %s" % tztmpfile.name)
                tztmpfile.write(tzfile.read())
                tztmpfile.flush()
                updateFrom(tztmpfile.name)
    else:
        updateFrom(tzDir)
   3055 
   3056 
   3057 def readCurrencyFile(tree):
   3058    reCurrency = re.compile(r"^[A-Z]{3}$")
   3059    reIntMinorUnits = re.compile(r"^\d+$")
   3060 
   3061    for country in tree.iterfind(".//CcyNtry"):
   3062        # Skip entry if no currency information is available.
   3063        currency = country.findtext("Ccy")
   3064        if currency is None:
   3065            continue
   3066        assert reCurrency.match(currency)
   3067 
   3068        minorUnits = country.findtext("CcyMnrUnts")
   3069        assert minorUnits is not None
   3070 
   3071        # Skip all entries without minorUnits or which use the default minorUnits.
   3072        if reIntMinorUnits.match(minorUnits) and int(minorUnits) != 2:
   3073            currencyName = country.findtext("CcyNm")
   3074            countryName = country.findtext("CtryNm")
   3075            yield (currency, int(minorUnits), currencyName, countryName)
   3076 
   3077 
   3078 def writeCurrencyFile(published, currencies, out):
   3079    with open(out, mode="w", encoding="utf-8", newline="") as f:
   3080        println = partial(print, file=f)
   3081 
   3082        println(generatedFileWarning)
   3083        println(f"// Version: {published}")
   3084 
   3085        println(
   3086            """
   3087 /**
   3088 * Mapping from currency codes to the number of decimal digits used for them.
   3089 * Default is 2 digits.
   3090 *
   3091 * Spec: ISO 4217 Currency and Funds Code List.
   3092 * http://www.currency-iso.org/en/home/tables/table-a1.html
   3093 */"""
   3094        )
   3095        println("var currencyDigits = {")
   3096        for currency, entries in groupby(
   3097            sorted(currencies, key=itemgetter(0)), itemgetter(0)
   3098        ):
   3099            for _, minorUnits, currencyName, countryName in entries:
   3100                println(f"  // {currencyName} ({countryName})")
   3101            println(f"  {currency}: {minorUnits},")
   3102        println("};")
   3103 
   3104 
def updateCurrency(topsrcdir, args):
    """Update the CurrencyDataGenerated.js file.

    `args.url` is the download location of the ISO 4217 list, `args.file` an
    optional local copy, and `args.out` the output path.
    """
    import xml.etree.ElementTree as ET
    from random import randint

    url = args.url
    out = args.out
    filename = args.file

    print("Arguments:")
    print("\tDownload url: %s" % url)
    print("\tLocal currency file: %s" % filename)
    print("\tOutput file: %s" % out)
    print("")

    # Parse the currency XML file and regenerate the output file.
    def updateFrom(currencyFile):
        print("Processing currency code list file...")
        tree = ET.parse(currencyFile)
        published = tree.getroot().attrib["Pblshd"]
        currencies = readCurrencyFile(tree)

        print("Writing CurrencyData file...")
        writeCurrencyFile(published, currencies, out)

    if filename is not None:
        print("Always make sure you have the newest currency code list file!")
        updateFrom(filename)
    else:
        print("Downloading currency & funds code list...")
        request = UrlRequest(url)
        # NOTE(review): the User-Agent carries a randomized Firefox version —
        # presumably to avoid server-side filtering; confirm before changing.
        request.add_header(
            "User-agent",
            "Mozilla/5.0 (Mobile; rv:{0}.0) Gecko/{0}.0 Firefox/{0}.0".format(
                randint(1, 999)
            ),
        )
        with closing(urlopen(request)) as currencyFile:
            fname = urlsplit(currencyFile.geturl()).path.split("/")[-1]
            with tempfile.NamedTemporaryFile(suffix=fname) as currencyTmpFile:
                print("File stored in %s" % currencyTmpFile.name)
                currencyTmpFile.write(currencyFile.read())
                currencyTmpFile.flush()
                updateFrom(currencyTmpFile.name)
   3148 
   3149 
def writeUnicodeExtensionsMappings(println, mapping, extension):
    # Emit C++ code which canonicalizes deprecated Unicode BCP 47 extension
    # types for the given extension ("u" or "t").
    #
    # `mapping` maps each extension key (e.g. "ca") to a dict of
    # {deprecated type: preferred type}. Keys with at most
    # `linear_search_max_length` replacements are emitted as an if-chain;
    # larger keys as sorted parallel arrays searched with std::lower_bound.
    println(
        f"""
template <size_t Length>
static inline bool Is{extension}Key(mozilla::Span<const char> key, const char (&str)[Length]) {{
  static_assert(Length == {extension}KeyLength + 1,
                "{extension} extension key is two characters long");
  return memcmp(key.data(), str, Length - 1) == 0;
}}

template <size_t Length>
static inline bool Is{extension}Type(mozilla::Span<const char> type, const char (&str)[Length]) {{
  static_assert(Length > {extension}KeyLength + 1,
                "{extension} extension type contains more than two characters");
  return type.size() == (Length - 1) &&
         memcmp(type.data(), str, Length - 1) == 0;
}}
""".rstrip("\n")
    )

    # Keys with more replacements than this threshold are emitted as binary
    # searched arrays instead of a linear if-chain.
    linear_search_max_length = 4

    needs_binary_search = any(
        len(replacements.items()) > linear_search_max_length
        for replacements in mapping.values()
    )

    # The comparator and search helper are only needed when at least one key
    # uses the binary search strategy.
    if needs_binary_search:
        println(
            f"""
static int32_t Compare{extension}Type(const char* a, mozilla::Span<const char> b) {{
  MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
             "unexpected null-character in string");

  using UnsignedChar = unsigned char;
  for (size_t i = 0; i < b.size(); i++) {{
    // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
    // we've reached the end of |a|, the below if-statement will always be true.
    // That ensures we don't read past the end of |a|.
    if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
      return r;
    }}
  }}

  // Return zero if both strings are equal or a positive number if |b| is a
  // prefix of |a|.
  return int32_t(UnsignedChar(a[b.size()]));
}}

template <size_t Length>
static inline const char* Search{extension}Replacement(
  const char* (&types)[Length], const char* (&aliases)[Length],
  mozilla::Span<const char> type) {{

  auto p = std::lower_bound(std::begin(types), std::end(types), type,
                            [](const auto& a, const auto& b) {{
                              return Compare{extension}Type(a, b) < 0;
                            }});
  if (p != std::end(types) && Compare{extension}Type(*p, type) == 0) {{
    return aliases[std::distance(std::begin(types), p)];
  }}
  return nullptr;
}}
""".rstrip("\n")
        )

    println(
        f"""
/**
 * Mapping from deprecated BCP 47 {extension} extension types to their preferred
 * values.
 *
 * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
 * Spec: https://www.unicode.org/reports/tr35/#t_Extension
 */
const char* mozilla::intl::Locale::Replace{extension}ExtensionType(
    mozilla::Span<const char> key, mozilla::Span<const char> type) {{
  MOZ_ASSERT(key.size() == {extension}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{extension}Key(key));

  MOZ_ASSERT(type.size() > {extension}KeyLength);
  MOZ_ASSERT(IsCanonicallyCased{extension}Type(type));
"""
    )

    def to_hash_key(replacements):
        # Canonical, hashable representation of a replacements dict; used to
        # detect keys which share the exact same replacement table.
        return str(sorted(replacements.items()))

    def write_array(subtags, name, length):
        # Emit `subtags` as a C++ string array named `name`, wrapping lines to
        # stay within an 80 column limit. `length` is the longest entry's
        # size, used to column-align the quoted entries.
        max_entries = (80 - len("    ")) // (length + len('"", '))

        println(f"    static const char* {name}[{len(subtags)}] = {{")

        for entries in grouper(subtags, max_entries):
            entries = (
                f'"{tag}"'.center(length + 2) for tag in entries if tag is not None
            )
            println("        {},".format(", ".join(entries)))

        println("    };")

    # Merge duplicate keys: the first key (in sorted order) with a given
    # replacement table becomes the canonical key; any later key with an
    # identical table is recorded as its alias.
    key_aliases = {}
    for key, replacements in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        if hash_key not in key_aliases:
            key_aliases[hash_key] = []
        else:
            key_aliases[hash_key].append(key)

    first_key = True
    for key, replacements in sorted(mapping.items(), key=itemgetter(0)):
        hash_key = to_hash_key(replacements)
        # Skip aliased keys; they are folded into the canonical key's branch.
        if key in key_aliases[hash_key]:
            continue

        # One IsKey condition per key: the canonical key first, then aliases.
        cond = (f'Is{extension}Key(key, "{k}")' for k in [key] + key_aliases[hash_key])

        if_kind = "if" if first_key else "else if"
        # Align continuation lines under the opening parenthesis.
        cond = (" ||\n" + " " * (2 + len(if_kind) + 2)).join(cond)
        println(
            f"""
  {if_kind} ({cond}) {{""".strip("\n")
        )
        first_key = False

        replacements = sorted(replacements.items(), key=itemgetter(0))

        if len(replacements) > linear_search_max_length:
            # Too many replacements for an if-chain: emit two parallel sorted
            # arrays and binary search them at runtime.
            types = [t for (t, _) in replacements]
            preferred = [r for (_, r) in replacements]
            max_len = max(len(k) for k in types + preferred)

            write_array(types, "types", max_len)
            write_array(preferred, "aliases", max_len)
            println(
                f"""
    return Search{extension}Replacement(types, aliases, type);
""".strip("\n")
            )
        else:
            # Few replacements: a linear if-chain is smaller and fast enough.
            for type, replacement in replacements:
                println(
                    f"""
    if (Is{extension}Type(type, "{type}")) {{
      return "{replacement}";
    }}""".strip("\n")
                )

        println(
            """
  }""".lstrip("\n")
        )

    println(
        """
  return nullptr;
}
""".strip("\n")
    )
   3310 
   3311 
def readICUUnitResourceFile(filepath):
    """Return a set of unit descriptor pairs where the first entry denotes the unit type and the
    second entry the unit name.

    Example:

    root{
        units{
            compound{
            }
            coordinate{
            }
            length{
                meter{
                }
            }
        }
        unitsNarrow:alias{"/LOCALE/unitsShort"}
        unitsShort{
            duration{
                day{
                }
                day-person:alias{"/LOCALE/unitsShort/duration/day"}
            }
            length{
                meter{
                }
            }
        }
    }

    Returns {("length", "meter"), ("duration", "day"), ("duration", "day-person")}

    Raises an Exception for any line which is neither a comment, a table
    start/end, nor a table entry.
    """

    # Tables open with `name{` and close with `}`; entries are `name{"value"}`.
    # Names may additionally contain "-", "%", ":", and quote characters.
    start_table_re = re.compile(r"^([\w\-%:\"]+)\{$")
    end_table_re = re.compile(r"^\}$")
    table_entry_re = re.compile(r"^([\w\-%:\"]+)\{\"(.*?)\"\}$")

    # The current resource table.
    table = {}

    # List of parent tables when parsing.
    parents = []

    # Track multi-line comments state.
    in_multiline_comment = False

    for line in flines(filepath, "utf-8-sig"):
        # Remove leading and trailing whitespace.
        line = line.strip()

        # Skip over comments.
        if in_multiline_comment:
            if line.endswith("*/"):
                in_multiline_comment = False
            continue

        if line.startswith("//"):
            continue

        if line.startswith("/*"):
            in_multiline_comment = True
            continue

        # Try to match the start of a table, e.g. `length{` or `meter{`.
        match = start_table_re.match(line)
        if match:
            parents.append(table)
            table_name = match.group(1)
            new_table = {}
            table[table_name] = new_table
            table = new_table
            continue

        # Try to match the end of a table.
        match = end_table_re.match(line)
        if match:
            table = parents.pop()
            continue

        # Try to match a table entry, e.g. `dnam{"meter"}`.
        match = table_entry_re.match(line)
        if match:
            entry_key = match.group(1)
            entry_value = match.group(2)
            table[entry_key] = entry_value
            continue

        raise Exception(f"unexpected line: '{line}' in {filepath}")

    assert len(parents) == 0, "Not all tables closed"
    assert len(table) == 1, "More than one root table"

    # Remove the top-level language identifier table.
    (_, unit_table) = table.popitem()

    # Add all units for the three display formats "units", "unitsNarrow", and "unitsShort".
    # But exclude the pseudo-units "compound" and "coordinate".
    return {
        # Strip a trailing ":alias" marker (6 characters) so that aliased
        # units like "day-person" are reported as units in their own right.
        (unit_type, unit_name if not unit_name.endswith(":alias") else unit_name[:-6])
        for unit_display in ("units", "unitsNarrow", "unitsShort")
        if unit_display in unit_table
        for (unit_type, unit_names) in unit_table[unit_display].items()
        if unit_type not in {"compound", "coordinate"}
        for unit_name in unit_names.keys()
    }
   3418 
   3419 
   3420 def computeSupportedUnits(all_units, sanctioned_units):
   3421    """Given the set of all possible ICU unit identifiers and the set of sanctioned unit
   3422    identifiers, compute the set of effectively supported ICU unit identifiers.
   3423    """
   3424 
   3425    def find_match(unit):
   3426        unit_match = [
   3427            (unit_type, unit_name)
   3428            for (unit_type, unit_name) in all_units
   3429            if unit_name == unit
   3430        ]
   3431        if unit_match:
   3432            assert len(unit_match) == 1
   3433            return unit_match[0]
   3434        return None
   3435 
   3436    def compound_unit_identifiers():
   3437        for numerator in sanctioned_units:
   3438            for denominator in sanctioned_units:
   3439                yield f"{numerator}-per-{denominator}"
   3440 
   3441    supported_simple_units = {find_match(unit) for unit in sanctioned_units}
   3442    assert None not in supported_simple_units
   3443 
   3444    supported_compound_units = {
   3445        unit_match
   3446        for unit_match in (find_match(unit) for unit in compound_unit_identifiers())
   3447        if unit_match
   3448    }
   3449 
   3450    return supported_simple_units | supported_compound_units
   3451 
   3452 
   3453 def readICUDataFilterForUnits(data_filter_file):
   3454    with open(data_filter_file, encoding="utf-8") as f:
   3455        data_filter = json.load(f)
   3456 
   3457    # Find the rule set for the "unit_tree".
   3458    unit_tree_rules = [
   3459        entry["rules"]
   3460        for entry in data_filter["resourceFilters"]
   3461        if entry["categories"] == ["unit_tree"]
   3462    ]
   3463    assert len(unit_tree_rules) == 1
   3464 
   3465    # Compute the list of included units from that rule set. The regular expression must match
   3466    # "+/*/length/meter" and mustn't match either "-/*" or "+/*/compound".
   3467    included_unit_re = re.compile(r"^\+/\*/(.+?)/(.+)$")
   3468    filtered_units = (included_unit_re.match(unit) for unit in unit_tree_rules[0])
   3469 
   3470    return {(unit.group(1), unit.group(2)) for unit in filtered_units if unit}
   3471 
   3472 
def writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units):
    # Regenerate the self-hosted JS list and the C++ header for the sanctioned
    # simple unit identifiers, then regenerate the unit test files.
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    intl_components_src_dir = os.path.join(
        js_src_builtin_intl_dir, "../../../../intl/components/src"
    )

    def find_unit_type(unit):
        # Map a bare unit name to its unique ICU unit type, e.g.
        # "meter" -> "length".
        result = [
            unit_type for (unit_type, unit_name) in all_units if unit_name == unit
        ]
        assert result and len(result) == 1
        return result[0]

    sanctioned_js_file = os.path.join(
        js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiersGenerated.js"
    )
    with open(sanctioned_js_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        # Emitted as an object keyed by unit name, for O(1) membership tests
        # from self-hosted JS.
        sanctioned_units_object = json.dumps(
            {unit: True for unit in sorted(sanctioned_units)},
            sort_keys=True,
            indent=2,
            separators=(",", ": "),
        )

        println(generatedFileWarning)

        println(
            """
/**
 * The list of currently supported simple unit identifiers.
 *
 * Intl.NumberFormat Unified API Proposal
 */"""
        )

        println("// prettier-ignore")
        println(f"var sanctionedSimpleUnitIdentifiers = {sanctioned_units_object};")

    sanctioned_h_file = os.path.join(intl_components_src_dir, "MeasureUnitGenerated.h")
    with open(sanctioned_h_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            """
#ifndef intl_components_MeasureUnitGenerated_h
#define intl_components_MeasureUnitGenerated_h

namespace mozilla::intl {

struct SimpleMeasureUnit {
  const char* const type;
  const char* const name;
};

/**
 * The list of currently supported simple unit identifiers.
 *
 * The list must be kept in alphabetical order of |name|.
 */
inline constexpr SimpleMeasureUnit simpleMeasureUnits[] = {
    // clang-format off"""
        )

        # Sorting by name keeps the C++ array in the alphabetical order the
        # header comment above requires.
        for unit_name in sorted(sanctioned_units):
            println(f'  {{"{find_unit_type(unit_name)}", "{unit_name}"}},')

        println(
            """
    // clang-format on
};

}  // namespace mozilla::intl

#endif
""".strip("\n")
        )

    writeUnitTestFiles(all_units, sanctioned_units)
   3555 
   3556 
def writeUnitTestFiles(all_units, sanctioned_units):
    """Generate test files for unit number formatters."""

    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = os.path.join(
        js_src_builtin_intl_dir, "../../tests/non262/Intl/NumberFormat"
    )

    def write_test(file_name, test_content, indent=4):
        # Write one reftest file: reftest header, generated-file warning, the
        # sanctioned unit list (JSON-formatted with `indent` spaces to match
        # the test body's indentation), the test body, and the reportCompare
        # footer.
        file_path = os.path.join(test_dir, file_name)
        with open(file_path, mode="w", encoding="utf-8", newline="") as f:
            println = partial(print, file=f)

            println('// |reftest| skip-if(!this.hasOwnProperty("Intl"))')
            println("")
            println(generatedFileWarning)
            println("")

            sanctioned_units_array = json.dumps(
                [unit for unit in sorted(sanctioned_units)],
                indent=indent,
                separators=(",", ": "),
            )

            println(
                f"const sanctionedSimpleUnitIdentifiers = {sanctioned_units_array};"
            )

            println(test_content)

            println(
                """
if (typeof reportCompare === "function")
{}reportCompare(true, true);""".format(" " * indent)
            )

    write_test(
        "unit-compound-combinations.js",
        """
// Test all simple unit identifier combinations are allowed.

for (const numerator of sanctionedSimpleUnitIdentifiers) {
    for (const denominator of sanctionedSimpleUnitIdentifiers) {
        const unit = `${numerator}-per-${denominator}`;
        const nf = new Intl.NumberFormat("en", {style: "unit", unit});

        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
    }
}""",
    )

    all_units_array = json.dumps(
        ["-".join(unit) for unit in sorted(all_units)], indent=4, separators=(",", ": ")
    )

    write_test(
        "unit-well-formed.js",
        f"""
const allUnits = {all_units_array};
"""
        + r"""
// Test only sanctioned unit identifiers are allowed.

for (const typeAndUnit of allUnits) {
    const [_, type, unit] = typeAndUnit.match(/(\w+)-(.+)/);

    let allowed;
    if (unit.includes("-per-")) {
        const [numerator, denominator] = unit.split("-per-");
        allowed = sanctionedSimpleUnitIdentifiers.includes(numerator) &&
                  sanctionedSimpleUnitIdentifiers.includes(denominator);
    } else {
        allowed = sanctionedSimpleUnitIdentifiers.includes(unit);
    }

    if (allowed) {
        const nf = new Intl.NumberFormat("en", {style: "unit", unit});
        assertEq(nf.format(1), nf.formatToParts(1).map(p => p.value).join(""));
    } else {
        assertThrowsInstanceOf(() => new Intl.NumberFormat("en", {style: "unit", unit}),
                               RangeError, `Missing error for "${typeAndUnit}"`);
    }
}""",
    )

    write_test(
        "unit-formatToParts-has-unit-field.js",
        """
// Test only English and Chinese to keep the overall runtime reasonable.
//
// Chinese is included because it contains more than one "unit" element for
// certain unit combinations.
const locales = ["en", "zh"];

// Plural rules for English only differentiate between "one" and "other". Plural
// rules for Chinese only use "other". That means we only need to test two values
// per unit.
const values = [0, 1];

// Ensure unit formatters contain at least one "unit" element.

for (const locale of locales) {
  for (const unit of sanctionedSimpleUnitIdentifiers) {
    const nf = new Intl.NumberFormat(locale, {style: "unit", unit});

    for (const value of values) {
      assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
               `locale=${locale}, unit=${unit}`);
    }
  }

  for (const numerator of sanctionedSimpleUnitIdentifiers) {
    for (const denominator of sanctionedSimpleUnitIdentifiers) {
      const unit = `${numerator}-per-${denominator}`;
      const nf = new Intl.NumberFormat(locale, {style: "unit", unit});

      for (const value of values) {
        assertEq(nf.formatToParts(value).some(e => e.type === "unit"), true,
                 `locale=${locale}, unit=${unit}`);
      }
    }
  }
}""",
        indent=2,
    )
   3682 
   3683 
   3684 def updateUnits(topsrcdir, args):
   3685    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
   3686    icu_path = os.path.join(topsrcdir, "intl", "icu")
   3687    icu_unit_path = os.path.join(icu_path, "source", "data", "unit")
   3688 
   3689    with open(
   3690        os.path.join(js_src_builtin_intl_dir, "SanctionedSimpleUnitIdentifiers.yaml"),
   3691        encoding="utf-8",
   3692    ) as f:
   3693        sanctioned_units = yaml.safe_load(f)
   3694 
   3695    # Read all possible ICU unit identifiers from the "unit/root.txt" resource.
   3696    unit_root_file = os.path.join(icu_unit_path, "root.txt")
   3697    all_units = readICUUnitResourceFile(unit_root_file)
   3698 
   3699    # Compute the set of effectively supported ICU unit identifiers.
   3700    supported_units = computeSupportedUnits(all_units, sanctioned_units)
   3701 
   3702    # Read the list of units we're including into the ICU data file.
   3703    data_filter_file = os.path.join(icu_path, "data_filter.json")
   3704    filtered_units = readICUDataFilterForUnits(data_filter_file)
   3705 
   3706    # Both sets must match to avoid resource loading errors at runtime.
   3707    if supported_units != filtered_units:
   3708 
   3709        def units_to_string(units):
   3710            return ", ".join("/".join(u) for u in units)
   3711 
   3712        missing = supported_units - filtered_units
   3713        if missing:
   3714            raise RuntimeError(f"Missing units: {units_to_string(missing)}")
   3715 
   3716        # Not exactly an error, but we currently don't have a use case where we need to support
   3717        # more units than required by ECMA-402.
   3718        extra = filtered_units - supported_units
   3719        if extra:
   3720            raise RuntimeError(f"Unnecessary units: {units_to_string(extra)}")
   3721 
   3722    writeSanctionedSimpleUnitIdentifiersFiles(all_units, sanctioned_units)
   3723 
   3724 
   3725 def readICUNumberingSystemsResourceFile(filepath):
   3726    """Returns a dictionary of numbering systems where the key denotes the numbering system name
   3727    and the value a dictionary with additional numbering system data.
   3728 
   3729    Example:
   3730 
   3731    numberingSystems:table(nofallback){
   3732        numberingSystems{
   3733            latn{
   3734                algorithmic:int{0}
   3735                desc{"0123456789"}
   3736                radix:int{10}
   3737            }
   3738            roman{
   3739                algorithmic:int{1}
   3740                desc{"%roman-upper"}
   3741                radix:int{10}
   3742            }
   3743        }
   3744    }
   3745 
   3746    Returns {"latn": {"digits": "0123456789", "algorithmic": False},
   3747             "roman": {"algorithmic": True}}
   3748    """
   3749 
   3750    start_table_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{$")
   3751    end_table_re = re.compile(r"^\}$")
   3752    table_entry_re = re.compile(r"^(\w+)(?:\:[\w\(\)]+)?\{(?:(?:\"(.*?)\")|(\d+))\}$")
   3753 
   3754    # The current resource table.
   3755    table = {}
   3756 
   3757    # List of parent tables when parsing.
   3758    parents = []
   3759 
   3760    # Track multi-line comments state.
   3761    in_multiline_comment = False
   3762 
   3763    for line in flines(filepath, "utf-8-sig"):
   3764        # Remove leading and trailing whitespace.
   3765        line = line.strip()
   3766 
   3767        # Skip over comments.
   3768        if in_multiline_comment:
   3769            if line.endswith("*/"):
   3770                in_multiline_comment = False
   3771            continue
   3772 
   3773        if line.startswith("//"):
   3774            continue
   3775 
   3776        if line.startswith("/*"):
   3777            in_multiline_comment = True
   3778            continue
   3779 
   3780        # Try to match the start of a table, e.g. `latn{`.
   3781        match = start_table_re.match(line)
   3782        if match:
   3783            parents.append(table)
   3784            table_name = match.group(1)
   3785            new_table = {}
   3786            table[table_name] = new_table
   3787            table = new_table
   3788            continue
   3789 
   3790        # Try to match the end of a table.
   3791        match = end_table_re.match(line)
   3792        if match:
   3793            table = parents.pop()
   3794            continue
   3795 
   3796        # Try to match a table entry, e.g. `desc{"0123456789"}`.
   3797        match = table_entry_re.match(line)
   3798        if match:
   3799            entry_key = match.group(1)
   3800            entry_value = (
   3801                match.group(2) if match.group(2) is not None else int(match.group(3))
   3802            )
   3803            table[entry_key] = entry_value
   3804            continue
   3805 
   3806        raise Exception(f"unexpected line: '{line}' in {filepath}")
   3807 
   3808    assert len(parents) == 0, "Not all tables closed"
   3809    assert len(table) == 1, "More than one root table"
   3810 
   3811    # Remove the two top-level "numberingSystems" tables.
   3812    (_, numbering_systems) = table.popitem()
   3813    (_, numbering_systems) = numbering_systems.popitem()
   3814 
   3815    # Assert all numbering systems use base 10.
   3816    assert all(ns["radix"] == 10 for ns in numbering_systems.values())
   3817 
   3818    # Return the numbering systems.
   3819    return {
   3820        key: (
   3821            {"digits": value["desc"], "algorithmic": False}
   3822            if not bool(value["algorithmic"])
   3823            else {"algorithmic": True}
   3824        )
   3825        for (key, value) in numbering_systems.items()
   3826    }
   3827 
   3828 
def writeNumberingSystemFiles(numbering_systems):
    # Regenerate NumberingSystemsGenerated.h (the C++ macro listing all
    # numbering systems with simple digit mappings) and the Intl test
    # shell.js file containing the full numbering system data.
    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))

    numbering_systems_js_file = os.path.join(
        js_src_builtin_intl_dir, "NumberingSystemsGenerated.h"
    )
    with open(numbering_systems_js_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            """
/**
 * The list of numbering systems with simple digit mappings.
 */

#ifndef builtin_intl_NumberingSystemsGenerated_h
#define builtin_intl_NumberingSystemsGenerated_h
"""
        )

        # Algorithmic systems (e.g. "roman") have no simple digit mapping and
        # are therefore excluded from the macro.
        simple_numbering_systems = sorted(
            name
            for (name, value) in numbering_systems.items()
            if not value["algorithmic"]
        )

        println("// clang-format off")
        println("#define NUMBERING_SYSTEMS_WITH_SIMPLE_DIGIT_MAPPINGS \\")
        println(
            "{}".format(
                ", \\\n".join(f'  "{name}"' for name in simple_numbering_systems)
            )
        )
        println("// clang-format on")
        println("")

        println("#endif  // builtin_intl_NumberingSystemsGenerated_h")

    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
    test_dir = os.path.join(js_src_builtin_intl_dir, "../../tests/non262/Intl")

    intl_shell_js_file = os.path.join(test_dir, "shell.js")

    with open(intl_shell_js_file, mode="w", encoding="utf-8", newline="") as f:
        println = partial(print, file=f)

        println(generatedFileWarning)

        println(
            f"""
// source: CLDR file common/bcp47/number.xml; version CLDR {readCLDRVersionFromICU()}.
// https://github.com/unicode-org/cldr/blob/master/common/bcp47/number.xml
// https://github.com/unicode-org/cldr/blob/master/common/supplemental/numberingSystems.xml
""".rstrip()
        )

        # ensure_ascii=False keeps non-Latin digit characters readable in the
        # generated test data.
        numbering_systems_object = json.dumps(
            numbering_systems,
            indent=2,
            separators=(",", ": "),
            sort_keys=True,
            ensure_ascii=False,
        )
        println(f"const numberingSystems = {numbering_systems_object};")
   3895 
   3896 
   3897 def updateNumberingSystems(topsrcdir, args):
   3898    js_src_builtin_intl_dir = os.path.dirname(os.path.abspath(__file__))
   3899    icu_path = os.path.join(topsrcdir, "intl", "icu")
   3900    icu_misc_path = os.path.join(icu_path, "source", "data", "misc")
   3901 
   3902    with open(
   3903        os.path.join(js_src_builtin_intl_dir, "NumberingSystems.yaml"),
   3904        encoding="utf-8",
   3905    ) as f:
   3906        numbering_systems = yaml.safe_load(f)
   3907 
   3908    # Read all possible ICU unit identifiers from the "misc/numberingSystems.txt" resource.
   3909    misc_ns_file = os.path.join(icu_misc_path, "numberingSystems.txt")
   3910    all_numbering_systems = readICUNumberingSystemsResourceFile(misc_ns_file)
   3911 
   3912    all_numbering_systems_simple_digits = {
   3913        name
   3914        for (name, value) in all_numbering_systems.items()
   3915        if not value["algorithmic"]
   3916    }
   3917 
   3918    # Assert ICU includes support for all required numbering systems. If this assertion fails,
   3919    # something is broken in ICU.
   3920    assert all_numbering_systems_simple_digits.issuperset(numbering_systems), (
   3921        f"{numbering_systems.difference(all_numbering_systems_simple_digits)}"
   3922    )
   3923 
   3924    # Assert the spec requires support for all numbering systems with simple digit mappings. If
   3925    # this assertion fails, file a PR at <https://github.com/tc39/ecma402> to include any new
   3926    # numbering systems.
   3927    assert all_numbering_systems_simple_digits.issubset(numbering_systems), (
   3928        f"{all_numbering_systems_simple_digits.difference(numbering_systems)}"
   3929    )
   3930 
   3931    writeNumberingSystemFiles(all_numbering_systems)
   3932 
   3933 
   3934 if __name__ == "__main__":
   3935    import argparse
   3936 
   3937    # This script must reside in js/src/builtin/intl to work correctly.
   3938    (thisDir, thisFile) = os.path.split(os.path.abspath(__file__))
   3939    dirPaths = os.path.normpath(thisDir).split(os.sep)
   3940    if "/".join(dirPaths[-4:]) != "js/src/builtin/intl":
   3941        raise RuntimeError("%s must reside in js/src/builtin/intl" % __file__)
   3942    topsrcdir = "/".join(dirPaths[:-4])
   3943 
   3944    def EnsureHttps(v):
   3945        if not v.startswith("https:"):
   3946            raise argparse.ArgumentTypeError("URL protocol must be https: " % v)
   3947        return v
   3948 
   3949    parser = argparse.ArgumentParser(description="Update intl data.")
   3950    subparsers = parser.add_subparsers(help="Select update mode")
   3951 
   3952    parser_cldr_tags = subparsers.add_parser(
   3953        "langtags", help="Update CLDR language tags data"
   3954    )
   3955    parser_cldr_tags.add_argument(
   3956        "--version", metavar="VERSION", help="CLDR version number"
   3957    )
   3958    parser_cldr_tags.add_argument(
   3959        "--url",
   3960        metavar="URL",
   3961        default="https://unicode.org/Public/cldr/<VERSION>/cldr-common-<VERSION>.zip",
   3962        type=EnsureHttps,
   3963        help="Download url CLDR data (default: %(default)s)",
   3964    )
   3965    parser_cldr_tags.add_argument(
   3966        "--out",
   3967        default=os.path.join(
   3968            topsrcdir, "intl", "components", "src", "LocaleGenerated.cpp"
   3969        ),
   3970        help="Output file (default: %(default)s)",
   3971    )
   3972    parser_cldr_tags.add_argument(
   3973        "file", nargs="?", help="Local cldr-common.zip file, if omitted uses <URL>"
   3974    )
   3975    parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
   3976 
   3977    parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
   3978    parser_tz.add_argument(
   3979        "--tz",
   3980        help="Local tzdata directory or file, if omitted downloads tzdata "
   3981        "distribution from https://www.iana.org/time-zones/",
   3982    )
   3983    parser_tz.add_argument(
   3984        "--out",
   3985        default=os.path.join(thisDir, "TimeZoneDataGenerated.h"),
   3986        help="Output file (default: %(default)s)",
   3987    )
   3988    parser_tz.set_defaults(func=partial(updateTzdata, topsrcdir))
   3989 
   3990    parser_currency = subparsers.add_parser(
   3991        "currency", help="Update currency digits mapping"
   3992    )
   3993    parser_currency.add_argument(
   3994        "--url",
   3995        metavar="URL",
   3996        default="https://www.six-group.com/dam/download/financial-information/data-center/iso-currrency/lists/list-one.xml",  # NOQA: E501
   3997        type=EnsureHttps,
   3998        help="Download url for the currency & funds code list (default: %(default)s)",
   3999    )
   4000    parser_currency.add_argument(
   4001        "--out",
   4002        default=os.path.join(thisDir, "CurrencyDataGenerated.js"),
   4003        help="Output file (default: %(default)s)",
   4004    )
   4005    parser_currency.add_argument(
   4006        "file", nargs="?", help="Local currency code list file, if omitted uses <URL>"
   4007    )
   4008    parser_currency.set_defaults(func=partial(updateCurrency, topsrcdir))
   4009 
   4010    parser_units = subparsers.add_parser(
   4011        "units", help="Update sanctioned unit identifiers mapping"
   4012    )
   4013    parser_units.set_defaults(func=partial(updateUnits, topsrcdir))
   4014 
   4015    parser_numbering_systems = subparsers.add_parser(
   4016        "numbering", help="Update numbering systems with simple digit mappings"
   4017    )
   4018    parser_numbering_systems.set_defaults(
   4019        func=partial(updateNumberingSystems, topsrcdir)
   4020    )
   4021 
   4022    args = parser.parse_args()
   4023    args.func(args)