tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-ucd-table.py (7707B)


      1 #!/usr/bin/env python3
      2 
      3 """usage: ./gen-ucd-table [--rust] ucd.nounihan.grouped.xml [/path/to/hb-script-list.h]
      4 
      5 Input file:
      6 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
      7 """
      8 
      9 # https://github.com/harfbuzz/packtab
     10 import packTab
     11 import packTab.ucdxml
     12 
     13 import sys, re
     14 import logging
     15 
     16 logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
     17 
     18 if len(sys.argv) > 1 and sys.argv[1] == "--rust":
     19    del sys.argv[1]
     20    logging.info("Generating Rust code...")
     21    language = "rust"
     22 else:
     23    logging.info("Generating C code...")
     24    language = "c"
     25 language = packTab.languages[language]
     26 
     27 if len(sys.argv) not in (2, 3):
     28    sys.exit(__doc__)
     29 
     30 logging.info("Loading UCDXML...")
     31 ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
     32 ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)
     33 
     34 hb_script_list_h = "hb-script-list.h" if len(sys.argv) < 3 else sys.argv[2]
     35 
     36 logging.info("Preparing data tables...")
     37 
     38 
     39 # This is how the data is encoded:
     40 #
     41 # General_Category (gc), Canonical_Combining_Class (ccc),
     42 # and Script (sc) are encoded as integers.
     43 #
     44 # Mirroring character (bmg) is encoded as difference from
     45 # the original character.
     46 #
     47 # Composition & Decomposition (dm) are encoded elaborately,
     48 # as discussed below.
     49 
     50 gc = [u["gc"] for u in ucd]
     51 ccc = [int(u["ccc"]) for u in ucd]
     52 bmg = [int(v, 16) - int(u) if v else 0 for u, v in enumerate(u["bmg"] for u in ucd)]
     53 sc = [u["sc"] for u in ucd]
     54 
     55 
     56 # Prepare Compose / Decompose data
     57 #
     58 # This code is very dense.  See hb_ucd_compose() / hb_ucd_decompose() for the logic.
     59 
     60 dm = {
     61    i: tuple(int(v, 16) for v in u["dm"].split())
     62    for i, u in enumerate(ucd)
     63    if u["dm"] != "#" and u["dt"] == "can" and not (0xAC00 <= i < 0xAC00 + 11172)
     64 }
     65 ce = {i for i, u in enumerate(ucd) if u["Comp_Ex"] == "Y"}
     66 
     67 assert not any(v for v in dm.values() if len(v) not in (1, 2))
     68 dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
     69 assert all((v[0] >> 16) in (0, 2) for v in dm1)
     70 dm1_p0_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
     71 dm1_p2_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
     72 dm1_order = {v: i + 1 for i, v in enumerate(dm1)}
     73 
     74 dm2 = sorted(
     75    (v + (i if i not in ce and not ccc[i] else 0,), v)
     76    for i, v in dm.items()
     77    if len(v) == 2
     78 )
     79 
     80 filt = lambda v: (
     81    (v[0] & 0xFFFFF800) == 0x0000
     82    and (v[1] & 0xFFFFFF80) == 0x0300
     83    and (v[2] & 0xFFF0C000) == 0x0000
     84 )
     85 dm2_u32_array = [v for v in dm2 if filt(v[0])]
     86 dm2_u64_array = [v for v in dm2 if not filt(v[0])]
     87 assert dm2_u32_array + dm2_u64_array == dm2
     88 dm2_u32_array = [
     89    "HB_CODEPOINT_ENCODE3_11_7_14 (0x%04X, 0x%04X, 0x%04X)" % v[0]
     90    for v in dm2_u32_array
     91 ]
     92 dm2_u64_array = [
     93    "HB_CODEPOINT_ENCODE3 (0x%04X, 0x%04X, 0x%04X)" % v[0] for v in dm2_u64_array
     94 ]
     95 
     96 l = 1 + len(dm1_p0_array) + len(dm1_p2_array)
     97 dm2_order = {v[1]: i + l for i, v in enumerate(dm2)}
     98 
     99 dm_order = {None: 0}
    100 dm_order.update(dm1_order)
    101 dm_order.update(dm2_order)
    102 
    103 
    104 # Prepare General_Category / Script mapping arrays
    105 
    106 gc_order = dict()
    107 for i, v in enumerate(
    108    (
    109        "Cc",
    110        "Cf",
    111        "Cn",
    112        "Co",
    113        "Cs",
    114        "Ll",
    115        "Lm",
    116        "Lo",
    117        "Lt",
    118        "Lu",
    119        "Mc",
    120        "Me",
    121        "Mn",
    122        "Nd",
    123        "Nl",
    124        "No",
    125        "Pc",
    126        "Pd",
    127        "Pe",
    128        "Pf",
    129        "Pi",
    130        "Po",
    131        "Ps",
    132        "Sc",
    133        "Sk",
    134        "Sm",
    135        "So",
    136        "Zl",
    137        "Zp",
    138        "Zs",
    139    )
    140 ):
    141    gc_order[i] = v
    142    gc_order[v] = i
    143 
    144 sc_order = dict()
    145 sc_array = []
    146 sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
    147 for line in open(hb_script_list_h):
    148    m = sc_re.search(line)
    149    if not m:
    150        continue
    151    name = m.group(1)
    152    tag = "".join(m.group(i) for i in range(2, 6))
    153    i = len(sc_array)
    154    sc_order[tag] = i
    155    sc_order[i] = tag
    156    if language.name == "rust":
    157        name = name.replace("HB_SCRIPT_", "script::")
    158    sc_array.append(name)
    159 
    160 
    161 # Write out main data
    162 
    163 DEFAULT = "DEFAULT"
    164 COMPACT = "COMPACT"
    165 SLOPPY = "SLOPPY"
    166 
    167 compression_level = {
    168    DEFAULT: 3,
    169    COMPACT: 9,
    170    SLOPPY: 9,
    171 }
    172 
    173 logging.info("Generating output...")
    174 print("/* == Start of generated table == */")
    175 print("/*")
    176 print(" * The following table is generated by running:")
    177 print(" *")
    178 print(
    179    " *   ./gen-ucd-table.py %sucd.nounihan.grouped.xml hb-script-list.h"
    180    % (("--%s " % language.name) if language.name != "c" else "")
    181 )
    182 print(" *")
    183 print(" * on file with this description:", ucdxml.description)
    184 print(" */")
    185 print()
    186 if language.name == "c":
    187    print("#ifndef HB_UCD_TABLE_HH")
    188    print("#define HB_UCD_TABLE_HH")
    189    print()
    190    print('#include "hb.hh"')
    191    print()
    192 elif language.name == "rust":
    193    print("pub(crate) mod ucd {")
    194    print()
    195    print("#![allow(unused_parens)]")
    196    print("#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]")
    197    print()
    198    print("use crate::hb::algs::{HB_CODEPOINT_ENCODE3, HB_CODEPOINT_ENCODE3_11_7_14};")
    199    print("use crate::hb::common::script;")
    200    print("use crate::hb::common::Script as hb_script_t;")
    201    print()
    202 else:
    203    assert False, "Unknown language: %s" % language.name
    204 
    205 # Write mapping data
    206 
    207 uint16_t = language.type_name("u16")
    208 uint32_t = language.type_name("u32")
    209 uint64_t = language.type_name("u64")
    210 
    211 if language.name == "c":
    212    private = True
    213 elif language.name == "rust":
    214    private = False
    215 else:
    216    assert False, "Unknown language: %s" % language.name
    217 
    218 code = packTab.Code("_hb_ucd")
    219 sc_array, _ = code.addArray("hb_script_t", "sc_map", sc_array)
    220 dm1_p0_array, _ = code.addArray(uint16_t, "dm1_p0_map", dm1_p0_array)
    221 dm1_p2_array, _ = code.addArray(uint16_t, "dm1_p2_map", dm1_p2_array)
    222 dm2_u32_array, _ = code.addArray(uint32_t, "dm2_u32_map", dm2_u32_array)
    223 dm2_u64_array, _ = code.addArray(uint64_t, "dm2_u64_map", dm2_u64_array)
    224 code.print_code(language=language, private=private)
    225 
    226 datasets = [
    227    ("gc", gc, "Cn", gc_order),
    228    ("ccc", ccc, 0, None),
    229    ("bmg", bmg, 0, None),
    230    ("sc", sc, "Zzzz", sc_order),
    231    ("dm", dm, None, dm_order),
    232 ]
    233 
    234 
    235 # Write main data
    236 
    237 modes = {}
    238 if language.name == "c":
    239    modes[DEFAULT] = "#ifndef HB_OPTIMIZE_SIZE"
    240    modes[COMPACT] = "#elif !defined(HB_NO_UCD_UNASSIGNED)"
    241    modes[SLOPPY] = "#else"
    242    modes[None] = "#endif"
    243 else:
    244    modes[DEFAULT] = ""
    245 
    246 for step, text in modes.items():
    247    print()
    248    if text:
    249        print(text)
    250        print()
    251    if step is None:
    252        continue
    253 
    254    compression = compression_level[step]
    255    logging.info("  Compression=%d:" % compression)
    256 
    257    if step == SLOPPY:
    258        for i in range(len(gc)):
    259            if (i % 128) and gc[i] == "Cn":
    260                gc[i] = gc[i - 1]
    261        for i in range(len(gc) - 2, -1, -1):
    262            if ((i + 1) % 128) and gc[i] == "Cn":
    263                gc[i] = gc[i + 1]
    264        for i in range(len(sc)):
    265            if (i % 128) and sc[i] == "Zzzz":
    266                sc[i] = sc[i - 1]
    267        for i in range(len(sc) - 2, -1, -1):
    268            if ((i + 1) % 128) and sc[i] == "Zzzz":
    269                sc[i] = sc[i + 1]
    270 
    271    code = packTab.Code("_hb_ucd")
    272 
    273    for name, data, default, mapping in datasets:
    274        sol = packTab.pack_table(
    275            data, default, mapping=mapping, compression=compression
    276        )
    277        logging.info("      Dataset=%-8s FullCost=%d" % (name, sol.fullCost))
    278        sol.genCode(code, name, private=private, language=language)
    279 
    280    code.print_code(language=language)
    281 
    282    print()
    283 
    284 if language.name == "c":
    285    print("#endif /* HB_UCD_TABLE_HH */")
    286 elif language.name == "rust":
    287    print("}")
    288 else:
    289    assert False, "Unknown language: %s" % language.name
    290 print()
    291 print("/* == End of generated table == */")
    292 logging.info("Done.")