tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

make_unicode.py (50312B)


      1 #!/usr/bin/env python
      2 # Based upon makeunicodedata.py
      3 # (http://hg.python.org/cpython/file/c8192197d23d/Tools/unicode/makeunicodedata.py)
      4 # written by Fredrik Lundh (fredrik@pythonware.com)
      5 #
      6 #    Copyright (C) 2011 Tom Schuster <evilpies@gmail.com>
      7 #
      8 #    This program is free software: you can redistribute it and/or modify
      9 #    it under the terms of the GNU General Public License as published by
     10 #    the Free Software Foundation, either version 3 of the License, or
     11 #    (at your option) any later version.
     12 #
     13 #    This program is distributed in the hope that it will be useful,
     14 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16 #    GNU General Public License for more details.
     17 #
     18 #    You should have received a copy of the GNU General Public License
     19 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
     20 
     21 import csv
     22 import io
     23 import os
     24 import re
     25 import sys
     26 from contextlib import closing
     27 from functools import partial
     28 from itertools import chain, tee, zip_longest
     29 from operator import is_not, itemgetter
     30 from urllib.request import urlopen
     31 from zipfile import ZipFile
     32 
     33 
     34 class codepoint_dict(dict):
     35    def name(self, code_point):
     36        (_, _, name, alias) = self[code_point]
     37        return "{}{}".format(name, (" (" + alias + ")" if alias else ""))
     38 
     39    def full_name(self, code_point):
     40        (_, _, name, alias) = self[code_point]
     41        return "U+{:04X} {}{}".format(
     42            code_point, name, (" (" + alias + ")" if alias else "")
     43        )
     44 
     45 
# ECMAScript 2016
# §11.2 White Space
whitespace = [
    # python doesn't support using control character names :(
    0x9,  # CHARACTER TABULATION
    0xB,  # LINE TABULATION
    0xC,  # FORM FEED
    ord("\N{SPACE}"),
    ord("\N{NO-BREAK SPACE}"),
    ord("\N{ZERO WIDTH NO-BREAK SPACE}"),  # also BOM
]

# §11.3 Line Terminators
line_terminator = [
    0xA,  # LINE FEED
    0xD,  # CARRIAGE RETURN
    ord("\N{LINE SEPARATOR}"),
    ord("\N{PARAGRAPH SEPARATOR}"),
]

# These are also part of IdentifierPart §11.6 Names and Keywords
compatibility_identifier_part = [
    ord("\N{ZERO WIDTH NON-JOINER}"),
    ord("\N{ZERO WIDTH JOINER}"),
]

# Per-code-point bit flags stored in the generated character-info table.
FLAG_SPACE = 1 << 0  # WhiteSpace or LineTerminator
FLAG_UNICODE_ID_START = 1 << 1  # has the ID_Start derived property
FLAG_UNICODE_ID_CONTINUE_ONLY = 1 << 2  # ID_Continue but not ID_Start

# Highest code point in the Basic Multilingual Plane.
MAX_BMP = 0xFFFF

# License headers emitted at the top of generated files.
public_domain = """
/*
* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/licenses/publicdomain/
*/
"""

mpl_license = """\
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
* vim: set ts=8 sts=4 et sw=4 tw=99:
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
"""

warning_message = """\
/* Generated by make_unicode.py DO NOT MODIFY */
"""

unicode_version_message = """\
/* Unicode version: {0} */
"""
    100 
    101 
    102 def read_unicode_data(unicode_data):
    103    """
    104    If you want to understand how this wonderful file format works checkout
    105      Unicode Standard Annex #44 - Unicode Character Database
    106      http://www.unicode.org/reports/tr44/
    107    """
    108 
    109    reader = csv.reader(unicode_data, delimiter=";")
    110 
    111    while True:
    112        row = next(reader, None)
    113        if row is None:
    114            return
    115        name = row[1]
    116 
    117        # We need to expand the UAX #44 4.2.3 Code Point Range
    118        if name.startswith("<") and name.endswith("First>"):
    119            next_row = next(reader)
    120 
    121            for i in range(int(row[0], 16), int(next_row[0], 16) + 1):
    122                row[0] = i
    123                row[1] = name[1:-8]
    124 
    125                yield row
    126        else:
    127            row[0] = int(row[0], 16)
    128            yield row
    129 
    130 
    131 def read_case_folding(case_folding):
    132    """
    133    File format is:
    134    <code>; <status>; <mapping>; # <name>
    135    """
    136 
    137    for line in case_folding:
    138        if line == "\n" or line.startswith("#"):
    139            continue
    140        row = line.split("; ")
    141        if row[1] in ["F", "T"]:
    142            continue
    143        assert row[1] in ["C", "S"], "expect either (C)ommon or (S)imple case foldings"
    144        code = int(row[0], 16)
    145        mapping = int(row[2], 16)
    146        yield (code, mapping)
    147 
    148 
    149 def read_derived_core_properties(derived_core_properties):
    150    for line in derived_core_properties:
    151        if line == "\n" or line.startswith("#"):
    152            continue
    153        row = line.split("#")[0].split(";")
    154        char_range = row[0].strip()
    155        char_property = row[1].strip()
    156        if ".." not in char_range:
    157            yield (int(char_range, 16), char_property)
    158        else:
    159            [start, end] = char_range.split("..")
    160            for char in range(int(start, 16), int(end, 16) + 1):
    161                yield (char, char_property)
    162 
    163 
    164 def read_special_casing(special_casing):
    165    # Format:
    166    # <code>; <lower>; <title>; <upper>; (<condition_list>;)? # <comment>
    167    for line in special_casing:
    168        if line == "\n" or line.startswith("#"):
    169            continue
    170        row = line.split("#")[0].split(";")
    171        code = int(row[0].strip(), 16)
    172        lower = row[1].strip()
    173        lower = [int(c, 16) for c in lower.split(" ")] if lower else []
    174        upper = row[3].strip()
    175        upper = [int(c, 16) for c in upper.split(" ")] if upper else []
    176        languages = []
    177        contexts = []
    178        condition = row[4].strip()
    179        if condition:
    180            for cond in condition.split(" "):
    181                if cond[0].islower():
    182                    languages.append(cond)
    183                else:
    184                    contexts.append(cond)
    185            pass
    186        yield (code, lower, upper, languages, contexts)
    187 
    188 
    189 def int_ranges(ints):
    190    """Yields consecutive ranges (inclusive) from integer values."""
    191    (a, b) = tee(sorted(ints))
    192    start = next(b)
    193    for curr, succ in zip_longest(a, b):
    194        if curr + 1 != succ:
    195            yield (start, curr)
    196            start = succ
    197 
    198 
    199 def utf16_encode(code):
    200    NonBMPMin = 0x10000
    201    LeadSurrogateMin = 0xD800
    202    TrailSurrogateMin = 0xDC00
    203 
    204    lead = (code - NonBMPMin) // 1024 + LeadSurrogateMin
    205    trail = ((code - NonBMPMin) % 1024) + TrailSurrogateMin
    206 
    207    return lead, trail
    208 
    209 
    210 def make_non_bmp_convert_macro(out_file, name, convert_map, codepoint_table):
    211    # Find continuous range in convert_map.
    212    convert_list = []
    213    entry = None
    214    for code in sorted(convert_map.keys()):
    215        lead, trail = utf16_encode(code)
    216        converted = convert_map[code]
    217        diff = converted - code
    218 
    219        if (
    220            entry
    221            and code == entry["code"] + entry["length"]
    222            and diff == entry["diff"]
    223            and lead == entry["lead"]
    224        ):
    225            entry["length"] += 1
    226            continue
    227 
    228        entry = {
    229            "code": code,
    230            "diff": diff,
    231            "length": 1,
    232            "lead": lead,
    233            "trail": trail,
    234        }
    235        convert_list.append(entry)
    236 
    237    # Generate macro call for each range.
    238    lines = []
    239    comment = []
    240    for entry in convert_list:
    241        from_code = entry["code"]
    242        to_code = entry["code"] + entry["length"] - 1
    243        diff = entry["diff"]
    244 
    245        lead = entry["lead"]
    246        from_trail = entry["trail"]
    247        to_trail = entry["trail"] + entry["length"] - 1
    248 
    249        lines.append(
    250            f"    MACRO(0x{from_code:x}, 0x{to_code:x}, 0x{lead:x}, 0x{from_trail:x}, 0x{to_trail:x}, {diff:d})"
    251        )
    252        comment.append(
    253            f"// {codepoint_table.full_name(from_code)} .. {codepoint_table.full_name(to_code)}"
    254        )
    255 
    256    out_file.write("\n".join(comment))
    257    out_file.write("\n")
    258    out_file.write(f"#define FOR_EACH_NON_BMP_{name}(MACRO) \\\n")
    259    out_file.write(" \\\n".join(lines))
    260    out_file.write("\n")
    261 
    262 
    263 def process_derived_core_properties(derived_core_properties):
    264    id_start = set()
    265    id_continue = set()
    266 
    267    for char, prop in read_derived_core_properties(derived_core_properties):
    268        if prop == "ID_Start":
    269            id_start.add(char)
    270        if prop == "ID_Continue":
    271            id_continue.add(char)
    272 
    273    return (id_start, id_continue)
    274 
    275 
def process_unicode_data(unicode_data, derived_core_properties):
    """Build the character-info tables from UnicodeData.txt and
    DerivedCoreProperties.txt lines.

    Returns a tuple of:
      table                - unique (upper_delta, lower_delta, flags) rows
      index                - maps every BMP code point to a row in |table|
      non_bmp_lower_map    - non-BMP code point -> simple lowercase mapping
      non_bmp_upper_map    - non-BMP code point -> simple uppercase mapping
      non_bmp_space_set    - non-BMP space (Zs) code points
      non_bmp_id_start_set - non-BMP ID_Start code points
      non_bmp_id_cont_set  - non-BMP ID_Continue code points
      codepoint_table      - code point -> (upper, lower, name, alias)
      test_space_table     - all space code points, used by generated tests
    """
    # Row 0 is a shared dummy entry for code points without any data.
    dummy = (0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * (MAX_BMP + 1)

    codepoint_table = codepoint_dict()
    test_space_table = []

    non_bmp_lower_map = {}
    non_bmp_upper_map = {}
    non_bmp_id_start_set = {}
    non_bmp_id_cont_set = {}
    non_bmp_space_set = {}

    (id_start, id_continue) = process_derived_core_properties(derived_core_properties)

    for row in read_unicode_data(unicode_data):
        code = row[0]
        name = row[1]
        category = row[2]  # General_Category
        # Fields counted from the row's end: the old-name alias
        # (Unicode_1_Name) and the simple upper-/lowercase mappings.
        alias = row[-5]
        uppercase = row[-3]
        lowercase = row[-2]

        # An empty mapping field means the code point maps to itself.
        if uppercase:
            upper = int(uppercase, 16)
        else:
            upper = code

        if lowercase:
            lower = int(lowercase, 16)
        else:
            lower = code

        codepoint_table[code] = (upper, lower, name, alias)

        # Non-BMP code points go into separate maps/sets instead of the
        # delta-encoded BMP table.
        if code > MAX_BMP:
            if code != lower:
                non_bmp_lower_map[code] = lower
            if code != upper:
                non_bmp_upper_map[code] = upper
            if category == "Zs":
                non_bmp_space_set[code] = 1
                test_space_table.append(code)
            if code in id_start:
                non_bmp_id_start_set[code] = 1
            if code in id_continue:
                non_bmp_id_cont_set[code] = 1
            continue

        assert lower <= MAX_BMP and upper <= MAX_BMP

        flags = 0

        # we combine whitespace and lineterminators because in practice we don't need them separated
        if category == "Zs" or code in whitespace or code in line_terminator:
            flags |= FLAG_SPACE
            test_space_table.append(code)

        # §11.6 (IdentifierStart)
        if code in id_start:
            flags |= FLAG_UNICODE_ID_START

        # §11.6 (IdentifierPart)
        elif code in id_continue or code in compatibility_identifier_part:
            flags |= FLAG_UNICODE_ID_CONTINUE_ONLY

        # Store the case mappings as 16-bit wrap-around deltas from |code|.
        up_d = upper - code
        low_d = lower - code

        assert up_d > -65535 and up_d < 65535
        assert low_d > -65535 and low_d < 65535

        upper = up_d & 0xFFFF
        lower = low_d & 0xFFFF

        item = (upper, lower, flags)

        # Deduplicate: identical (upper, lower, flags) tuples share one row.
        i = cache.get(item)
        if i is None:
            assert item not in table
            cache[item] = i = len(table)
            table.append(item)
        index[code] = i

    return (
        table,
        index,
        non_bmp_lower_map,
        non_bmp_upper_map,
        non_bmp_space_set,
        non_bmp_id_start_set,
        non_bmp_id_cont_set,
        codepoint_table,
        test_space_table,
    )
    373 
    374 
    375 def process_case_folding(case_folding):
    376    folding_map = {}
    377    rev_folding_map = {}
    378    folding_dummy = (0,)
    379    folding_table = [folding_dummy]
    380    folding_cache = {folding_dummy: 0}
    381    folding_index = [0] * (MAX_BMP + 1)
    382 
    383    folding_tests = []
    384    folding_codes = set()
    385 
    386    for code, mapping in read_case_folding(case_folding):
    387        folding_map[code] = mapping
    388 
    389        if mapping not in rev_folding_map:
    390            rev_folding_map[mapping] = [code]
    391        else:
    392            rev_folding_map[mapping].append(code)
    393 
    394        folding_codes.add(code)
    395        folding_codes.add(mapping)
    396 
    397    for code in sorted(folding_codes):
    398        if code in folding_map:
    399            folding = folding_map[code]
    400        else:
    401            folding = code
    402 
    403        if code in rev_folding_map:
    404            rev_folding = rev_folding_map[code]
    405        elif folding in rev_folding_map:
    406            rev_folding = [c for c in rev_folding_map[folding] if c != code]
    407        else:
    408            rev_folding = []
    409 
    410        if folding != code or rev_folding:
    411            item = [code]
    412            if folding != code:
    413                item.append(folding)
    414            folding_tests.append(item + rev_folding)
    415 
    416        if code > MAX_BMP:
    417            continue
    418 
    419        folding_d = folding - code
    420 
    421        assert folding_d > -65535 and folding_d < 65535
    422 
    423        folding = folding_d & 0xFFFF
    424 
    425        item = (folding,)
    426 
    427        i = folding_cache.get(item)
    428        if i is None:
    429            assert item not in folding_table
    430            folding_cache[item] = i = len(folding_table)
    431            folding_table.append(item)
    432        folding_index[code] = i
    433    return (folding_table, folding_index, folding_tests)
    434 
    435 
    436 def process_special_casing(special_casing, table, index):
    437    # Unconditional special casing.
    438    unconditional_tolower = {}
    439    unconditional_toupper = {}
    440 
    441    # Conditional special casing, language independent.
    442    conditional_tolower = {}
    443    conditional_toupper = {}
    444 
    445    # Conditional special casing, language dependent.
    446    lang_conditional_tolower = {}
    447    lang_conditional_toupper = {}
    448 
    449    def caseInfo(code):
    450        (upper, lower, flags) = table[index[code]]
    451        return ((code + lower) & 0xFFFF, (code + upper) & 0xFFFF)
    452 
    453    for code, lower, upper, languages, contexts in read_special_casing(special_casing):
    454        assert code <= MAX_BMP, "Unexpected character outside of BMP: %s" % code
    455        assert len(languages) <= 1, "Expected zero or one language ids: %s" % languages
    456        assert len(contexts) <= 1, (
    457            "Expected zero or one casing contexts: %s" % languages
    458        )
    459 
    460        (default_lower, default_upper) = caseInfo(code)
    461        special_lower = len(lower) != 1 or lower[0] != default_lower
    462        special_upper = len(upper) != 1 or upper[0] != default_upper
    463 
    464        # Invariant: If |code| has casing per UnicodeData.txt, then it also has
    465        # casing rules in SpecialCasing.txt.
    466        assert code == default_lower or len(lower) != 1 or code != lower[0]
    467        assert code == default_upper or len(upper) != 1 or code != upper[0]
    468 
    469        language = languages[0] if languages else None
    470        context = contexts[0] if contexts else None
    471 
    472        if not language and not context:
    473            if special_lower:
    474                unconditional_tolower[code] = lower
    475            if special_upper:
    476                unconditional_toupper[code] = upper
    477        elif not language and context:
    478            if special_lower:
    479                conditional_tolower[code] = (lower, context)
    480            if special_upper:
    481                conditional_toupper[code] = (upper, context)
    482        else:
    483            if language not in lang_conditional_tolower:
    484                lang_conditional_tolower[language] = {}
    485                lang_conditional_toupper[language] = {}
    486            if special_lower:
    487                lang_conditional_tolower[language][code] = (lower, context)
    488            if special_upper:
    489                lang_conditional_toupper[language][code] = (upper, context)
    490 
    491    # Certain special casing rules are inlined in jsstr.cpp, ensure these cases
    492    # still match the current SpecialCasing.txt file.
    493    def lowerCase(code):
    494        (lower, _) = caseInfo(code)
    495        return lower
    496 
    497    def upperCase(code):
    498        (_, upper) = caseInfo(code)
    499        return upper
    500 
    501    def ascii(char_dict):
    502        return (ch for ch in char_dict.keys() if ch <= 0x7F)
    503 
    504    def latin1(char_dict):
    505        return (ch for ch in char_dict.keys() if ch <= 0xFF)
    506 
    507    def is_empty(iterable):
    508        return not any(True for _ in iterable)
    509 
    510    def is_equals(iter1, iter2):
    511        return all(x == y for (x, y) in zip_longest(iter1, iter2))
    512 
    513    # Ensure no ASCII characters have special case mappings.
    514    assert is_empty(ascii(unconditional_tolower))
    515    assert is_empty(ascii(unconditional_toupper))
    516    assert is_empty(ascii(conditional_tolower))
    517    assert is_empty(ascii(conditional_toupper))
    518 
    519    # Ensure no Latin1 characters have special lower case mappings.
    520    assert is_empty(latin1(unconditional_tolower))
    521    assert is_empty(latin1(conditional_tolower))
    522 
    523    # Ensure no Latin1 characters have conditional special upper case mappings.
    524    assert is_empty(latin1(conditional_toupper))
    525 
    526    # Ensure U+00DF is the only Latin1 character with a special upper case mapping.
    527    assert is_equals([0x00DF], latin1(unconditional_toupper))
    528 
    529    # Ensure U+0130 is the only character with a special lower case mapping.
    530    assert is_equals([0x0130], unconditional_tolower)
    531 
    532    # Ensure no characters have language independent conditional upper case mappings.
    533    assert is_empty(conditional_toupper)
    534 
    535    # Ensure U+03A3 is the only character with language independent conditional lower case mapping.
    536    assert is_equals([0x03A3], conditional_tolower)
    537 
    538    # Verify U+0130 and U+03A3 have simple lower case mappings.
    539    assert all(ch != lowerCase(ch) for ch in [0x0130, 0x03A3])
    540 
    541    # Ensure Azeri, Lithuanian, and Turkish are the only languages with conditional case mappings.
    542    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_tolower.keys()))
    543    assert is_equals(["az", "lt", "tr"], sorted(lang_conditional_toupper.keys()))
    544 
    545    # Maximum case mapping length is three characters.
    546    assert (
    547        max(
    548            map(
    549                len,
    550                chain(
    551                    unconditional_tolower.values(),
    552                    unconditional_toupper.values(),
    553                    map(itemgetter(0), conditional_tolower.values()),
    554                    map(itemgetter(0), conditional_toupper.values()),
    555                    map(
    556                        itemgetter(0),
    557                        chain.from_iterable(
    558                            d.values() for d in lang_conditional_tolower.values()
    559                        ),
    560                    ),
    561                    map(
    562                        itemgetter(0),
    563                        chain.from_iterable(
    564                            d.values() for d in lang_conditional_toupper.values()
    565                        ),
    566                    ),
    567                ),
    568            )
    569        )
    570        <= 3
    571    )
    572 
    573    # Ensure all case mapping contexts are known (see Unicode 9.0, §3.13 Default Case Algorithms).
    574    assert set([
    575        "After_I",
    576        "After_Soft_Dotted",
    577        "Final_Sigma",
    578        "More_Above",
    579        "Not_Before_Dot",
    580    ]).issuperset(
    581        set(
    582            filter(
    583                partial(is_not, None),
    584                chain(
    585                    map(itemgetter(1), conditional_tolower.values()),
    586                    map(itemgetter(1), conditional_toupper.values()),
    587                    map(
    588                        itemgetter(1),
    589                        chain.from_iterable(
    590                            d.values() for d in lang_conditional_tolower.values()
    591                        ),
    592                    ),
    593                    map(
    594                        itemgetter(1),
    595                        chain.from_iterable(
    596                            d.values() for d in lang_conditional_toupper.values()
    597                        ),
    598                    ),
    599                ),
    600            )
    601        )
    602    )
    603 
    604    # Special casing for U+00DF (LATIN SMALL LETTER SHARP S).
    605    assert upperCase(0x00DF) == 0x00DF and unconditional_toupper[0x00DF] == [
    606        0x0053,
    607        0x0053,
    608    ]
    609 
    610    # Special casing for U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE).
    611    assert unconditional_tolower[0x0130] == [0x0069, 0x0307]
    612 
    613    # Special casing for U+03A3 (GREEK CAPITAL LETTER SIGMA).
    614    assert lowerCase(0x03A3) == 0x03C3 and conditional_tolower[0x03A3] == (
    615        [0x03C2],
    616        "Final_Sigma",
    617    )
    618 
    619    return (unconditional_tolower, unconditional_toupper)
    620 
    621 
    622 def make_non_bmp_file(version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table):
    623    file_name = "UnicodeNonBMP.h"
    624    with open(file_name, mode="w", encoding="utf-8") as non_bmp_file:
    625        non_bmp_file.write(mpl_license)
    626        non_bmp_file.write("\n")
    627        non_bmp_file.write(warning_message)
    628        non_bmp_file.write(unicode_version_message.format(version))
    629        non_bmp_file.write(
    630            """
    631 #ifndef util_UnicodeNonBMP_h
    632 #define util_UnicodeNonBMP_h
    633 
    634 // |MACRO| receives the following arguments
    635 //   MACRO(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF)
    636 //     FROM:       code point where the range starts
    637 //     TO:         code point where the range ends
    638 //     LEAD:       common lead surrogate of FROM and TO
    639 //     TRAIL_FROM: trail surrogate of FROM
    640 //     TRAIL_FROM: trail surrogate of TO
    641 //     DIFF:       the difference between the code point in the range and
    642 //                 converted code point
    643 
    644 """
    645        )
    646 
    647        make_non_bmp_convert_macro(
    648            non_bmp_file, "LOWERCASE", non_bmp_lower_map, codepoint_table
    649        )
    650        non_bmp_file.write("\n")
    651        make_non_bmp_convert_macro(
    652            non_bmp_file, "UPPERCASE", non_bmp_upper_map, codepoint_table
    653        )
    654 
    655        non_bmp_file.write(
    656            """
    657 #endif /* util_UnicodeNonBMP_h */
    658 """
    659        )
    660 
    661 
    662 def write_special_casing_methods(unconditional_toupper, codepoint_table, println):
    663    def hexlit(n):
    664        """Returns C++ hex-literal for |n|."""
    665        return f"0x{n:04X}"
    666 
    667    def describe_range(ranges, depth):
    668        indent = depth * "    "
    669        for start, end in ranges:
    670            if start == end:
    671                println(indent, f"// {codepoint_table.full_name(start)}")
    672            else:
    673                println(
    674                    indent,
    675                    f"// {codepoint_table.full_name(start)} .. {codepoint_table.full_name(end)}",
    676                )
    677 
    678    def out_range(start, end):
    679        """Tests if the input character isn't a member of the set {x | start <= x <= end}."""
    680        if start == end:
    681            return f"ch != {hexlit(start)}"
    682        return f"ch < {hexlit(start)} || ch > {hexlit(end)}"
    683 
    684    def in_range(start, end, parenthesize=False):
    685        """Tests if the input character is in the set {x | start <= x <= end}."""
    686        if start == end:
    687            return f"ch == {hexlit(start)}"
    688        (left, right) = ("(", ")") if parenthesize else ("", "")
    689        return f"{left}ch >= {hexlit(start)} && ch <= {hexlit(end)}{right}"
    690 
    691    def in_any_range(ranges, spaces):
    692        """Tests if the input character is included in any of the given ranges."""
    693        lines = [[]]
    694        for start, end in ranges:
    695            expr = in_range(start, end, parenthesize=True)
    696            line = " || ".join(lines[-1] + [expr])
    697            if len(line) < (100 - len(spaces) - len(" ||")):
    698                lines[-1].append(expr)
    699            else:
    700                lines.append([expr])
    701        return f" ||\n{spaces}".join(" || ".join(t) for t in lines)
    702 
    703    def write_range_accept(parent_list, child_list, depth):
    704        """Accepts the input character if it matches any code unit in |child_list|."""
    705        (min_parent, max_parent) = (parent_list[0], parent_list[-1])
    706        (min_child, max_child) = (child_list[0], child_list[-1])
    707        assert min_child >= min_parent
    708        assert max_child <= max_parent
    709        indent = depth * "    "
    710 
    711        child_ranges = list(int_ranges(child_list))
    712        has_successor = max_child != max_parent
    713 
    714        # If |child_list| is a contiguous list of code units, emit a simple
    715        # range check: |min_child <= input <= max_child|.
    716        if len(child_ranges) == 1:
    717            describe_range(child_ranges, depth)
    718            if has_successor:
    719                println(indent, f"if (ch <= {hexlit(max_child)}) {{")
    720                println(indent, f"    return ch >= {hexlit(min_child)};")
    721                println(indent, "}")
    722            else:
    723                println(indent, f"return {in_range(min_child, max_child)};")
    724            return
    725 
    726        # Otherwise create a disjunction over the subranges in |child_ranges|.
    727        if not has_successor:
    728            spaces = indent + len("return ") * " "
    729        else:
    730            spaces = indent + len("    return ") * " "
    731        range_test_expr = in_any_range(child_ranges, spaces)
    732 
    733        if min_child != min_parent:
    734            println(indent, f"if (ch < {hexlit(min_child)}) {{")
    735            println(indent, "    return false;")
    736            println(indent, "}")
    737 
    738        # If there's no successor block, we can omit the |input <= max_child| check,
    739        # because it was already checked when we emitted the parent range test.
    740        if not has_successor:
    741            describe_range(child_ranges, depth)
    742            println(indent, f"return {range_test_expr};")
    743        else:
    744            println(indent, f"if (ch <= {hexlit(max_child)}) {{")
    745            describe_range(child_ranges, depth + 1)
    746            println(indent, f"    return {range_test_expr};")
    747            println(indent, "}")
    748 
   def write_ChangesWhenUpperCasedSpecialCasing():
       """Checks if the input has a special upper case mapping.

       Emits the C++ predicate |ChangesWhenUpperCasedSpecialCasing(char16_t)|
       that returns true iff the code unit has an unconditional special
       (multi-code-unit) upper-case mapping.
       """
       println("bool")
       println("js::unicode::ChangesWhenUpperCasedSpecialCasing(char16_t ch)")
       println("{")

       assert unconditional_toupper, "|unconditional_toupper| is not empty"

       # Sorted list of code units with special upper case mappings.
       code_list = sorted(unconditional_toupper.keys())

       # Fail-fast if the input character isn't a special casing character.
       println(f"    if ({out_range(code_list[0], code_list[-1])}) {{")
       println("        return false;")
       println("    }")

       # Partition the BMP into sixteen 4096-code-unit blocks and emit a
       # membership test for each non-empty block.
       for i in range(0, 16):
           # Check if the input characters is in the range:
           # |start_point <= input < end_point|.
           start_point = i << 12
           end_point = (i + 1) << 12
           matches = [cu for cu in code_list if start_point <= cu < end_point]

           # Skip empty ranges.
           if not matches:
               continue

           # If |matches| consists of only a few characters, directly check
           # the input against the characters in |matches|.
           if len(matches) <= 8:
               write_range_accept(code_list, matches, depth=1)
               continue

           # Otherwise split into further subranges.

           # Only enter the if-block if the input is less-or-equals to the
           # largest value in the current range.
           is_last_block = matches[-1] == code_list[-1]
           if not is_last_block:
               println(f"    if (ch <= {hexlit(matches[-1])}) {{")
           else:
               # No block follows, so an early-return guard saves one
               # nesting level in the generated code.
               println(f"    if (ch < {hexlit(matches[0])}) {{")
               println("        return false;")
               println("    }")

           # Subdivide the block into 256-code-unit chunks.
           for j in range(0, 16):
               inner_start = start_point + (j << 8)
               inner_end = start_point + ((j + 1) << 8)
               inner_matches = [cu for cu in matches if inner_start <= cu < inner_end]

               if inner_matches:
                   # One extra indent level when nested in |if (ch <= ...)|.
                   d = 1 if is_last_block else 2
                   write_range_accept(matches, inner_matches, depth=d)

           if not is_last_block:
               println("    }")

       println("}")
    807 
    808    def write_LengthUpperCaseSpecialCasing():
    809        """Slow case: Special casing character was found, returns its mapping length."""
    810        println("size_t")
    811        println("js::unicode::LengthUpperCaseSpecialCasing(char16_t ch)")
    812        println("{")
    813 
    814        println("    switch(ch) {")
    815        for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)):
    816            println(
    817                f"      case {hexlit(code)}: return {len(converted)}; // {codepoint_table.name(code)}"
    818            )
    819        println("    }")
    820        println("")
    821        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
    822        println("    return 0;")
    823 
    824        println("}")
    825 
    826    def write_AppendUpperCaseSpecialCasing():
    827        """Slow case: Special casing character was found, append its mapping characters."""
    828        println("void")
    829        println(
    830            "js::unicode::AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index)"  # NOQA: E501
    831        )
    832        println("{")
    833 
    834        println("    switch(ch) {")
    835        for code, converted in sorted(unconditional_toupper.items(), key=itemgetter(0)):
    836            println(f"      case {hexlit(code)}: // {codepoint_table.name(code)}")
    837            for ch in converted:
    838                println(
    839                    f"        elements[(*index)++] = {hexlit(ch)}; // {codepoint_table.name(ch)}"
    840                )
    841            println("        return;")
    842        println("    }")
    843        println("")
    844        println('    MOZ_ASSERT_UNREACHABLE("Bad character input.");')
    845 
    846        println("}")
    847 
   # Emit the fast-path predicate first, then the two slow-path helpers,
   # separated by blank lines in the generated C++ file.
   write_ChangesWhenUpperCasedSpecialCasing()
   println("")
   write_LengthUpperCaseSpecialCasing()
   println("")
   write_AppendUpperCaseSpecialCasing()
    853 
    854 
    855 def write_ascii_lookup_tables(table, index, write, println):
    856    def is_id_compat(code):
    857        return code == ord("\N{DOLLAR SIGN}") or code == ord("\N{LOW LINE}")
    858 
    859    def is_id_start(code):
    860        (upper, lower, flags) = table[index[code]]
    861        return (flags & FLAG_UNICODE_ID_START) or is_id_compat(code)
    862 
    863    def is_id_continue(code):
    864        (upper, lower, flags) = table[index[code]]
    865        return (flags & FLAG_UNICODE_ID_CONTINUE_ONLY) or is_id_start(code)
    866 
    867    def is_space(code):
    868        (upper, lower, flags) = table[index[code]]
    869        return flags & FLAG_SPACE
    870 
    871    def write_entries(name, predicate):
    872        println(f"const bool unicode::{name}[] = {{")
    873        header = "".join(f"{x: <6}" for x in range(0, 10)).rstrip()
    874        println(f"/*       {header}  */")
    875        for i in range(0, 13):
    876            write(f"/* {i: >2} */")
    877            for j in range(0, 10):
    878                code = i * 10 + j
    879                if code <= 0x7F:
    880                    write(" {},".format("true" if predicate(code) else "____"))
    881            println("")
    882        println("};")
    883 
    884    println("")
    885    println("#define ____ false")
    886 
    887    println(
    888        """
    889 /*
    890 * Identifier start chars:
    891 * -      36:    $
    892 * -  65..90: A..Z
    893 * -      95:    _
    894 * - 97..122: a..z
    895 */"""
    896    )
    897    write_entries("js_isidstart", is_id_start)
    898 
    899    println(
    900        """
    901 /*
    902 * Identifier chars:
    903 * -      36:    $
    904 * -  48..57: 0..9
    905 * -  65..90: A..Z
    906 * -      95:    _
    907 * - 97..122: a..z
    908 */"""
    909    )
    910    write_entries("js_isident", is_id_continue)
    911 
    912    println(
    913        """
    914 /* Whitespace chars: '\\t', '\\n', '\\v', '\\f', '\\r', ' '. */"""
    915    )
    916    write_entries("js_isspace", is_space)
    917 
    918    println("")
    919    println("#undef ____")
    920 
    921 
    922 def write_latin1_lookup_tables(table, index, write, println):
    923    def case_info(code):
    924        assert 0 <= code <= MAX_BMP
    925        (upper, lower, flags) = table[index[code]]
    926        return ((code + upper) & 0xFFFF, (code + lower) & 0xFFFF, flags)
    927 
    928    def toLowerCase(code):
    929        (_, lower, _) = case_info(code)
    930        assert lower <= 0xFF, "lower-case of Latin-1 is always Latin-1"
    931        return lower
    932 
    933    def write_entries(name, mapper):
    934        println(f"const JS::Latin1Char unicode::{name}[] = {{")
    935        header = "".join(f"{x: <6}" for x in range(0, 16)).rstrip()
    936        println(f"/*       {header}  */")
    937        for i in range(0, 16):
    938            write(f"/* {i: >2} */")
    939            for j in range(0, 16):
    940                code = i * 16 + j
    941                if code <= 0xFF:
    942                    write(f" 0x{mapper(code):02X},")
    943            println("")
    944        println("};")
    945 
    946    println("")
    947    write_entries("latin1ToLowerCaseTable", toLowerCase)
    948 
    949 
def make_bmp_mapping_test(
   version, codepoint_table, unconditional_tolower, unconditional_toupper
):
   """Generate the jstests file checking toUpperCase/toLowerCase for every
   BMP code unit, including multi-code-unit special-casing expansions."""

   def unicodeEsc(n):
       # \\uXXXX escape for embedding in a JS string literal.
       return f"\\u{n:04X}"

   file_name = "../tests/non262/String/string-upper-lower-mapping.js"
   with open(file_name, mode="w", encoding="utf-8") as output:
       write = partial(print, file=output, sep="", end="")
       println = partial(print, file=output, sep="", end="\n")

       write(warning_message)
       write(unicode_version_message.format(version))
       write(public_domain)
       println("var mapping = [")
       for code in range(0, MAX_BMP + 1):
           entry = codepoint_table.get(code)

           if entry:
               (upper, lower, _, _) = entry
               # Special (multi-code-unit) mappings take precedence over
               # the simple one-to-one mapping from UnicodeData.txt.
               upper = (
                   unconditional_toupper[code]
                   if code in unconditional_toupper
                   else [upper]
               )
               lower = (
                   unconditional_tolower[code]
                   if code in unconditional_tolower
                   else [lower]
               )
               println(
                   '  ["{}", "{}"], /* {} */'.format(
                       "".join(map(unicodeEsc, upper)),
                       "".join(map(unicodeEsc, lower)),
                       codepoint_table.name(code),
                   )
               )
           else:
               # Unassigned code points case-map to themselves.
               println('  ["{0}", "{0}"],'.format(unicodeEsc(code)))
       println("];")
       write(
           """
assertEq(mapping.length, 0x10000);
for (var i = 0; i <= 0xffff; i++) {
   var char = String.fromCharCode(i);
   var info = mapping[i];

   assertEq(char.toUpperCase(), info[0]);
   assertEq(char.toLowerCase(), info[1]);
}

if (typeof reportCompare === "function")
   reportCompare(true, true);
"""
       )
   1005 
   1006 
   1007 def make_non_bmp_mapping_test(
   1008    version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
   1009 ):
   1010    file_name = "../tests/non262/String/string-code-point-upper-lower-mapping.js"
   1011    with open(file_name, mode="w", encoding="utf-8") as test_non_bmp_mapping:
   1012        test_non_bmp_mapping.write(warning_message)
   1013        test_non_bmp_mapping.write(unicode_version_message.format(version))
   1014        test_non_bmp_mapping.write(public_domain)
   1015 
   1016        for code in sorted(non_bmp_upper_map.keys()):
   1017            test_non_bmp_mapping.write(
   1018                f"""\
   1019 assertEq(String.fromCodePoint(0x{code:04X}).toUpperCase().codePointAt(0), 0x{non_bmp_upper_map[code]:04X}); // {codepoint_table.name(code)}, {codepoint_table.name(non_bmp_upper_map[code])}
   1020 """
   1021            )
   1022 
   1023        for code in sorted(non_bmp_lower_map.keys()):
   1024            test_non_bmp_mapping.write(
   1025                f"""\
   1026 assertEq(String.fromCodePoint(0x{code:04X}).toLowerCase().codePointAt(0), 0x{non_bmp_lower_map[code]:04X}); // {codepoint_table.name(code)}, {codepoint_table.name(non_bmp_lower_map[code])}
   1027 """
   1028            )
   1029 
   1030        test_non_bmp_mapping.write(
   1031            """
   1032 if (typeof reportCompare === "function")
   1033    reportCompare(true, true);
   1034 """
   1035        )
   1036 
   1037 
   1038 def make_space_test(version, test_space_table, codepoint_table):
   1039    def hex_and_name(c):
   1040        return f"    0x{c:04X} /* {codepoint_table.name(c)} */"
   1041 
   1042    file_name = "../tests/non262/String/string-space-trim.js"
   1043    with open(file_name, mode="w", encoding="utf-8") as test_space:
   1044        test_space.write(warning_message)
   1045        test_space.write(unicode_version_message.format(version))
   1046        test_space.write(public_domain)
   1047        test_space.write("var onlySpace = String.fromCharCode(\n")
   1048        test_space.write(",\n".join(map(hex_and_name, test_space_table)))
   1049        test_space.write("\n);\n")
   1050        test_space.write(
   1051            """
   1052 assertEq(onlySpace.trim(), "");
   1053 assertEq((onlySpace + 'aaaa').trim(), 'aaaa');
   1054 assertEq(('aaaa' + onlySpace).trim(), 'aaaa');
   1055 assertEq((onlySpace + 'aaaa' + onlySpace).trim(), 'aaaa');
   1056 
   1057 if (typeof reportCompare === "function")
   1058    reportCompare(true, true);
   1059 """
   1060        )
   1061 
   1062 
   1063 def make_regexp_space_test(version, test_space_table, codepoint_table):
   1064    def hex_and_name(c):
   1065        return f"    0x{c:04X} /* {codepoint_table.name(c)} */"
   1066 
   1067    file_name = "../tests/non262/RegExp/character-class-escape-s.js"
   1068    with open(file_name, mode="w", encoding="utf-8") as test_space:
   1069        test_space.write(warning_message)
   1070        test_space.write(unicode_version_message.format(version))
   1071        test_space.write(public_domain)
   1072        test_space.write("var onlySpace = String.fromCodePoint(\n")
   1073        test_space.write(",\n".join(map(hex_and_name, test_space_table)))
   1074        test_space.write("\n);\n")
   1075        test_space.write(
   1076            r"""
   1077 assertEq(/^\s+$/.exec(onlySpace) !== null, true);
   1078 assertEq(/^[\s]+$/.exec(onlySpace) !== null, true);
   1079 assertEq(/^[^\s]+$/.exec(onlySpace) === null, true);
   1080 
   1081 assertEq(/^\S+$/.exec(onlySpace) === null, true);
   1082 assertEq(/^[\S]+$/.exec(onlySpace) === null, true);
   1083 assertEq(/^[^\S]+$/.exec(onlySpace) !== null, true);
   1084 
   1085 // Also test with Unicode RegExps.
   1086 assertEq(/^\s+$/u.exec(onlySpace) !== null, true);
   1087 assertEq(/^[\s]+$/u.exec(onlySpace) !== null, true);
   1088 assertEq(/^[^\s]+$/u.exec(onlySpace) === null, true);
   1089 
   1090 assertEq(/^\S+$/u.exec(onlySpace) === null, true);
   1091 assertEq(/^[\S]+$/u.exec(onlySpace) === null, true);
   1092 assertEq(/^[^\S]+$/u.exec(onlySpace) !== null, true);
   1093 
   1094 if (typeof reportCompare === "function")
   1095    reportCompare(true, true);
   1096 """
   1097        )
   1098 
   1099 
   1100 def make_icase_test(version, folding_tests, codepoint_table):
   1101    def char_hex(c):
   1102        return f"0x{c:04X}"
   1103 
   1104    file_name = "../tests/non262/RegExp/unicode-ignoreCase.js"
   1105    with open(file_name, mode="w", encoding="utf-8") as test_icase:
   1106        test_icase.write("// |reftest| skip-if(!this.hasOwnProperty('Intl'))\n\n")
   1107        test_icase.write(warning_message)
   1108        test_icase.write(unicode_version_message.format(version))
   1109        test_icase.write(public_domain)
   1110        test_icase.write(
   1111            """
   1112 var BUGNUMBER = 1135377;
   1113 var summary = "Implement RegExp unicode flag -- ignoreCase flag.";
   1114 
   1115 print(BUGNUMBER + ": " + summary);
   1116 
   1117 function test(code, ...equivs) {
   1118  var codeRe = new RegExp(String.fromCodePoint(code) + "+", "iu");
   1119  var ans = String.fromCodePoint(code) + equivs.map(c => String.fromCodePoint(c)).join("");
   1120  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
   1121  codeRe = new RegExp("[" + String.fromCodePoint(code) + "]+", "iu");
   1122  assertEqArray(codeRe.exec("<" + ans + ">"), [ans]);
   1123 }
   1124 """
   1125        )
   1126        for args in folding_tests:
   1127            test_icase.write(
   1128                "test({}); // {}\n".format(
   1129                    ", ".join(map(char_hex, args)),
   1130                    ", ".join(map(codepoint_table.name, args)),
   1131                )
   1132            )
   1133        test_icase.write(
   1134            """
   1135 if (typeof reportCompare === "function")
   1136    reportCompare(true, true);
   1137 """
   1138        )
   1139 
   1140 
def make_unicode_file(
   version,
   table,
   index,
   folding_table,
   folding_index,
   non_bmp_space_set,
   non_bmp_id_start_set,
   non_bmp_id_cont_set,
   unconditional_toupper,
   codepoint_table,
):
   """Write Unicode.cpp: the two-level char-info and case-folding lookup
   tables, the non-BMP identifier predicates, the special-casing methods,
   and the ASCII/Latin-1 fast-path tables."""
   index1, index2, shift = splitbins(index)

   # Don't forget to update CharInfo in Unicode.h if you need to change this
   assert shift == 6

   folding_index1, folding_index2, folding_shift = splitbins(folding_index)

   # Don't forget to update CaseFoldInfo in Unicode.h if you need to change this
   assert folding_shift == 5

   # verify correctness
   for char in index:
       test = table[index[char]]

       idx = index1[char >> shift]
       idx = index2[(idx << shift) + (char & ((1 << shift) - 1))]

       assert test == table[idx]

   # verify correctness
   for char in folding_index:
       test = folding_table[folding_index[char]]

       idx = folding_index1[char >> folding_shift]
       idx = folding_index2[
           (idx << folding_shift) + (char & ((1 << folding_shift) - 1))
       ]

       assert test == folding_table[idx]

   comment = """
/*
* So how does indexing work?
* First let's have a look at a char16_t, 16-bits:
*              [................]
* Step 1:
*  Extracting the upper 11 bits from the char16_t.
*   upper = char >>  5 ([***********.....])
* Step 2:
*  Using these bits to get an reduced index from index1.
*   index = index1[upper]
* Step 3:
*  Combining the index and the bottom 5 bits of the original char16_t.
*   real_index = index2[(index << 5) + (char & ((1 << 5) - 1))] ([...********+++++])
*
* The advantage here is that the biggest number in index1 doesn't need 10 bits,
* but 7 and we save some memory.
*
* Step 4:
*  Get the character informations by looking up real_index in js_charinfo.
*
* Pseudocode of generation:
*
* let table be the mapping of char16_t => js_charinfo_index
* let index1 be an empty array
* let index2 be an empty array
* let cache be a hash map
*
* while shift is less then maximal amount you can shift 0xffff before it's 0
*  let chunks be table split in chunks of size 2**shift
*
*  for every chunk in chunks
*   if chunk is in cache
*    let index be cache[chunk]
*   else
*    let index be the max key of index2 + 1
*    for element in chunk
*     push element to index2
*    put index as chunk in cache
*
*   push index >> shift to index1
*
*  increase shift
*  stop if you found the best shift
*/
"""

   def dump(data, name, println):
       # Emit |data| as a uint8_t C array named |name|, wrapped to fit the
       # 99-column limit.
       println(f"const uint8_t unicode::{name}[] = {{")

       line = pad = " " * 4
       lines = []
       for entry in data:
           assert entry < 256
           s = str(entry)
           s = s.rjust(3)

           if len(line + s) + 5 > 99:
               lines.append(line.rstrip())
               line = pad + s + ", "
           else:
               line = line + s + ", "
       lines.append(line.rstrip())

       println("\n".join(lines))
       println("};")

   def write_table(data_type, name, tbl, idx1_name, idx1, idx2_name, idx2, println):
       # Emit the struct table plus its two split index arrays.
       println(f"const {data_type} unicode::{name}[] = {{")
       for d in tbl:
           println("    {{ {} }},".format(", ".join(str(e) for e in d)))
       println("};")
       println("")

       dump(idx1, idx1_name, println)
       println("")
       dump(idx2, idx2_name, println)
       println("")

   def write_supplemental_identifier_method(name, group_set, println):
       # Emit a range-based membership predicate for non-BMP code points.
       println("bool")
       println(f"js::unicode::{name}(char32_t codePoint)")
       println("{")
       for from_code, to_code in int_ranges(group_set.keys()):
           println(
               f"    if (codePoint >= 0x{from_code:X} && codePoint <= 0x{to_code:X}) {{ // {codepoint_table.name(from_code)} .. {codepoint_table.name(to_code)}"
           )
           println("        return true;")
           println("    }")
       println("    return false;")
       println("}")
       println("")

   file_name = "Unicode.cpp"
   with open(file_name, "w", encoding="utf-8") as data_file:
       write = partial(print, file=data_file, sep="", end="")
       println = partial(print, file=data_file, sep="", end="\n")

       write(warning_message)
       write(unicode_version_message.format(version))
       write(public_domain)
       println('#include "util/Unicode.h"')
       println("")
       println("using namespace js;")
       println("using namespace js::unicode;")
       write(comment)

       write_table(
           "CharacterInfo",
           "js_charinfo",
           table,
           "index1",
           index1,
           "index2",
           index2,
           println,
       )

       write_table(
           "FoldingInfo",
           "js_foldinfo",
           folding_table,
           "folding_index1",
           folding_index1,
           "folding_index2",
           folding_index2,
           println,
       )

       # If the following assert fails, it means space character is added to
       # non-BMP area.  In that case the following code should be uncommented
       # and the corresponding code should be added to frontend.  (At least
       # unicode::IsSpace will require updating to handle this.)
       assert len(non_bmp_space_set.keys()) == 0

       write_supplemental_identifier_method(
           "IsIdentifierStartNonBMP", non_bmp_id_start_set, println
       )

       write_supplemental_identifier_method(
           "IsIdentifierPartNonBMP", non_bmp_id_cont_set, println
       )

       write_special_casing_methods(unconditional_toupper, codepoint_table, println)

       write_ascii_lookup_tables(table, index, write, println)

       write_latin1_lookup_tables(table, index, write, println)
   1331 
   1332 
   1333 def getsize(data):
   1334    """return smallest possible integer size for the given array"""
   1335    maxdata = max(data)
   1336    assert maxdata < 2**32
   1337 
   1338    if maxdata < 256:
   1339        return 1
   1340    elif maxdata < 65536:
   1341        return 2
   1342    else:
   1343        return 4
   1344 
   1345 
def splitbins(t):
   """t -> (t1, t2, shift).  Split a table to save space.

   t is a sequence of ints.  This function can be useful to save space if
   many of the ints are the same.  t1 and t2 are lists of ints, and shift
   is an int, chosen to minimize the combined size of t1 and t2 (in C
   code), and where for each i in range(len(t)),
       t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   where mask is a bitmask isolating the last "shift" bits.
   """

   def dump(t1, t2, shift, bytes):
       # Report the chosen split and its memory footprint on stderr.
       print(
           "%d+%d bins at shift %d; %d bytes" % (len(t1), len(t2), shift, bytes),
           file=sys.stderr,
       )
       print("Size of original table:", len(t) * getsize(t), "bytes", file=sys.stderr)

   n = len(t) - 1  # last valid index
   maxshift = 0  # the most we can shift n and still have something left
   if n > 0:
       while n >> 1:
           n >>= 1
           maxshift += 1
   del n
   bytes = sys.maxsize  # smallest total size so far
   t = tuple(t)  # so slices can be dict keys
   # Try every chunk size 2**shift; identical chunks are deduplicated
   # through |bincache| so t2 stores each distinct chunk only once.
   for shift in range(maxshift + 1):
       t1 = []
       t2 = []
       size = 2**shift
       bincache = {}

       for i in range(0, len(t), size):
           bin = t[i : i + size]

           index = bincache.get(bin)
           if index is None:
               index = len(t2)
               bincache[bin] = index
               t2.extend(bin)
           # t1 stores the chunk's start offset in t2, pre-shifted.
           t1.append(index >> shift)

       # determine memory size
       b = len(t1) * getsize(t1) + len(t2) * getsize(t2)
       if b < bytes:
           best = t1, t2, shift
           bytes = b
   t1, t2, shift = best

   print("Best:", end=" ", file=sys.stderr)
   dump(t1, t2, shift, bytes)

   # exhaustively verify that the decomposition is correct
   mask = 2**shift - 1
   for i in range(len(t)):
       assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
   return best
   1404 
   1405 
def update_unicode(args):
   """Optionally download the Unicode data files, process them, and
   regenerate Unicode.cpp plus the derived jstests files."""
   base_path = os.getcwd()

   version = args.version
   if version is not None:
       baseurl = "https://unicode.org/Public"
       # "UNIDATA" is the unversioned "latest published" directory.
       if version == "UNIDATA":
           url = "%s/%s" % (baseurl, version)
       else:
           url = "%s/%s/ucd" % (baseurl, version)

   print("Arguments:")
   if version is not None:
       print("\tVersion: %s" % version)
       print("\tDownload url: %s" % url)

       # Download UCD.zip once and extract only the files we consume.
       request_url = f"{url}/UCD.zip"
       with closing(urlopen(request_url)) as downloaded_file:
           downloaded_data = io.BytesIO(downloaded_file.read())

       with ZipFile(downloaded_data) as zip_file:
           for fname in [
               "UnicodeData.txt",
               "CaseFolding.txt",
               "DerivedCoreProperties.txt",
               "SpecialCasing.txt",
           ]:
               zip_file.extract(fname, path=base_path)
   else:
       print("\tUsing local files.")
       print("\tAlways make sure you have the newest Unicode files!")
   print("")

   def version_from_file(f, fname):
       # The first line of each UCD file carries its version number.
       pat_version = re.compile(r"# %s-(?P<version>\d+\.\d+\.\d+).txt" % fname)
       return pat_version.match(f.readline()).group("version")

   with open(
       os.path.join(base_path, "UnicodeData.txt"), encoding="utf-8"
   ) as unicode_data, open(
       os.path.join(base_path, "CaseFolding.txt"), encoding="utf-8"
   ) as case_folding, open(
       os.path.join(base_path, "DerivedCoreProperties.txt"), encoding="utf-8"
   ) as derived_core_properties, open(
       os.path.join(base_path, "SpecialCasing.txt"), encoding="utf-8"
   ) as special_casing:
       unicode_version = version_from_file(
           derived_core_properties, "DerivedCoreProperties"
       )

       print("Processing...")
       (
           table,
           index,
           non_bmp_lower_map,
           non_bmp_upper_map,
           non_bmp_space_set,
           non_bmp_id_start_set,
           non_bmp_id_cont_set,
           codepoint_table,
           test_space_table,
       ) = process_unicode_data(unicode_data, derived_core_properties)
       (folding_table, folding_index, folding_tests) = process_case_folding(
           case_folding
       )
       (unconditional_tolower, unconditional_toupper) = process_special_casing(
           special_casing, table, index
       )

   print("Generating...")
   make_unicode_file(
       unicode_version,
       table,
       index,
       folding_table,
       folding_index,
       non_bmp_space_set,
       non_bmp_id_start_set,
       non_bmp_id_cont_set,
       unconditional_toupper,
       codepoint_table,
   )
   make_non_bmp_file(
       unicode_version, non_bmp_lower_map, non_bmp_upper_map, codepoint_table
   )

   make_bmp_mapping_test(
       unicode_version, codepoint_table, unconditional_tolower, unconditional_toupper
   )
   make_non_bmp_mapping_test(
       unicode_version, non_bmp_upper_map, non_bmp_lower_map, codepoint_table
   )
   make_space_test(unicode_version, test_space_table, codepoint_table)
   make_regexp_space_test(unicode_version, test_space_table, codepoint_table)
   make_icase_test(unicode_version, folding_tests, codepoint_table)
   1501 
   1502 
if __name__ == "__main__":
   import argparse

   # This script must be run from js/src/util to work correctly.
   if "/".join(os.path.normpath(os.getcwd()).split(os.sep)[-3:]) != "js/src/util":
       raise RuntimeError("%s must be run from js/src/util" % sys.argv[0])

   parser = argparse.ArgumentParser(description="Update Unicode data.")

   parser.add_argument(
       "--version",
       help='Optional Unicode version number. If specified, downloads the\
                             selected version from <https://unicode.org/Public>. If not specified\
                             uses the existing local files to generate the Unicode data. The\
                             number must match a published Unicode version, e.g. use\
                             "--version=8.0.0" to download Unicode 8 files. Alternatively use\
                             "--version=UNIDATA" to download the latest published version.',
   )

   # Single sub-command for now; argparse dispatches through |func|.
   parser.set_defaults(func=update_unicode)

   args = parser.parse_args()
   args.func(args)