tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

base_chars.py (5530B)


      1 # This Source Code Form is subject to the terms of the Mozilla Public
      2 # License, v. 2.0. If a copy of the MPL was not distributed with this
      3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 
      5 import re
      6 from collections import namedtuple
      7 from unicodedata import category, combining, normalize
      8 
      9 UNICODE_LIMIT = 0x110000
     10 
     11 UNICODE_COMBINING_CLASS_NOT_REORDERED = 0
     12 UNICODE_COMBINING_CLASS_KANA_VOICING = 8
     13 UNICODE_COMBINING_CLASS_VIRAMA = 9
     14 
     15 BaseCharMapping = namedtuple("BaseCharMapping", ("char", "base_char"))
     16 BaseCharMappingBlock = namedtuple("BaseCharMappingBlock", ("first", "last", "offset"))
     17 
     18 
     19 # Keep this function in sync with IsCombiningDiacritic in nsUnicodeProperties.h.
     20 def is_combining_diacritic(char):
     21    return combining(char) not in (
     22        UNICODE_COMBINING_CLASS_NOT_REORDERED,
     23        UNICODE_COMBINING_CLASS_KANA_VOICING,
     24        UNICODE_COMBINING_CLASS_VIRAMA,
     25        91,
     26        129,
     27        130,
     28        132,
     29    )
     30 
     31 
     32 # Keep this function in sync with IsMathOrMusicSymbol in nsUnicodeProperties.h.
     33 def is_math_or_music_symbol(char):
     34    return category(char) in ("Sm", "So")
     35 
     36 
     37 def changes_plane(char, base_char):
     38    # Mappings that would change the first 16 bits of a character are not
     39    # currently supported. This is because the mapping table only records the
     40    # last 16 bits of the base character and also because moving into or out of
     41    # the basic multilingual plane would change the length of a UTF-16 string.
     42    return ord(char) >> 16 != ord(base_char) >> 16
     43 
     44 
     45 def main(header, fallback_table):
     46    mappings = {}
     47 
     48    # Glean mappings from decompositions
     49 
     50    for char in range(UNICODE_LIMIT):
     51        char = chr(char)
     52        if is_combining_diacritic(char) or is_math_or_music_symbol(char):
     53            continue
     54        decomposition = normalize("NFD", char)
     55        if len(decomposition) < 2:
     56            continue
     57        base_char = decomposition[0]
     58        if changes_plane(char, base_char):
     59            continue
     60        next_char = decomposition[1]
     61        if not is_combining_diacritic(next_char):
     62            # Hangul syllables decompose but do not actually have diacritics.
     63            # This also excludes decompositions with the Japanese marks U+3099
     64            # and U+309A (COMBINING KATAKANA-HIRAGANA [SEMI-]VOICED SOUND
     65            # MARK), which we should not ignore for searching (bug 1624244).
     66            continue
     67        mappings[char] = base_char
     68 
     69    # Add mappings from the ASCII fallback table
     70 
     71    for line in open(fallback_table, encoding="UTF-8"):
     72        m = re.match("^(.) → (.+?) ;", line)
     73        if not m:
     74            continue
     75        char = m.group(1)
     76        decomposition = m.group(2)
     77        if len(decomposition) >= 3:
     78            if decomposition.startswith("'") and decomposition.endswith("'"):
     79                decomposition = decomposition[1:-1]
     80        if len(decomposition) >= 2:
     81            if decomposition.startswith("\\"):
     82                decomposition = decomposition[1:]
     83        if len(decomposition) > 1:
     84            continue
     85        if changes_plane(char, decomposition):
     86            continue
     87        mappings[char] = decomposition
     88 
     89    # Organize mappings into contiguous blocks
     90 
     91    mappings = sorted([BaseCharMapping(ord(k), ord(v)) for k, v in mappings.items()])
     92    blocks = []
     93    i = 0
     94    while i < len(mappings) - 1:
     95        offset = i
     96        first = mappings[i].char & 0xFF
     97        while (
     98            i < len(mappings) - 1 and mappings[i].char >> 8 == mappings[i + 1].char >> 8
     99        ):
    100            while (
    101                i < len(mappings) - 1
    102                and mappings[i].char >> 8 == mappings[i + 1].char >> 8
    103                and mappings[i + 1].char - mappings[i].char > 1
    104            ):
    105                char = mappings[i].char + 1
    106                mappings.insert(i + 1, BaseCharMapping(char, char))
    107                i += 1
    108            i += 1
    109        last = mappings[i].char & 0xFF
    110        blocks.append(BaseCharMappingBlock(first, last, offset))
    111        i += 1
    112 
    113    indexes = []
    114    for i, block in enumerate(blocks):
    115        while len(indexes) < mappings[block.offset].char >> 8:
    116            indexes.append(255)
    117        indexes.append(i)
    118 
    119    # Write the mappings to a C header file
    120 
    121    header.write("struct BaseCharMappingBlock {\n")
    122    header.write("  uint8_t mFirst;\n")
    123    header.write("  uint8_t mLast;\n")
    124    header.write("  uint16_t mMappingStartOffset;\n")
    125    header.write("};\n")
    126    header.write("\n")
    127    header.write("static const uint16_t BASE_CHAR_MAPPING_LIST[] = {\n")
    128    for char, base_char in mappings:
    129        header.write(f"  /* {char:#06x}" + " */ " + f"{base_char & 0xFFFF:#06x}" + ",")
    130        if char != base_char:
    131            header.write(" /* " + chr(char) + " → " + chr(base_char) + " */")
    132        header.write("\n")
    133    header.write("};\n")
    134    header.write("\n")
    135    header.write(
    136        "static const struct BaseCharMappingBlock BASE_CHAR_MAPPING_BLOCKS[] = {\n"
    137    )
    138    for block in blocks:
    139        header.write(
    140            "  {"
    141            + f"{block.first:#04x}"
    142            + ", "
    143            + f"{block.last:#04x}"
    144            + ", "
    145            + str(block.offset).rjust(4)
    146            + "}, // "
    147            + f"{mappings[block.offset].char >> 8:#04x}"
    148            + "xx\n"
    149        )
    150    header.write("};\n")
    151    header.write("\n")
    152    header.write("static const uint8_t BASE_CHAR_MAPPING_BLOCK_INDEX[] = {\n")
    153    for i, index in enumerate(indexes):
    154        header.write("  " + str(index).rjust(3) + ", // " + f"{i:#04x}" + "xx\n")
    155    header.write("};\n")