gen-ucd-table.py (7707B)
#!/usr/bin/env python3

"""usage: ./gen-ucd-table [--rust] ucd.nounihan.grouped.xml [/path/to/hb-script-list.h]

Input file:
* https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
"""

# https://github.com/harfbuzz/packtab
import packTab
import packTab.ucdxml

import sys
import re
import logging

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)

# An optional leading "--rust" flag selects the output language; it is
# removed from sys.argv so the positional arguments keep the same indices
# for both languages.
want_rust = len(sys.argv) > 1 and sys.argv[1] == "--rust"
if want_rust:
    del sys.argv[1]
logging.info("Generating Rust code..." if want_rust else "Generating C code...")
language = packTab.languages["rust" if want_rust else "c"]

# Exactly one or two positional arguments are accepted; otherwise print the
# usage text (the module docstring) and exit.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)

logging.info("Loading UCDXML...")
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# The script-list header path is the optional second positional argument.
hb_script_list_h = sys.argv[2] if len(sys.argv) > 2 else "hb-script-list.h"

logging.info("Preparing data tables...")


# Encoding of the per-code-point data:
#
# General_Category (gc), Canonical_Combining_Class (ccc),
# and Script (sc) are encoded as integers.
#
# Mirroring character (bmg) is encoded as difference from
# the original character.
#
# Composition & Decomposition (dm) are encoded elaborately,
# as discussed below.

gc = [entry["gc"] for entry in ucd]
ccc = [int(entry["ccc"]) for entry in ucd]
# "bmg" is a hex string, or empty when absent; store the signed distance
# from the code point (the list index) to its mirror, 0 when there is none.
mirrors = [entry["bmg"] for entry in ucd]
bmg = [int(mirror, 16) - cp if mirror else 0 for cp, mirror in enumerate(mirrors)]
sc = [entry["sc"] for entry in ucd]


# Prepare Compose / Decompose data
#
# This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
# Canonical decomposition mappings, keyed by code point.  Entries whose "dm"
# is "#" or whose decomposition type is not "can" are skipped, as is the
# range [0xAC00, 0xAC00 + 11172) (presumably the Hangul syllables, which are
# handled algorithmically elsewhere -- confirm against hb_ucd_decompose()).
dm = {
    i: tuple(int(v, 16) for v in u["dm"].split())
    for i, u in enumerate(ucd)
    if u["dm"] != "#" and u["dt"] == "can" and not (0xAC00 <= i < 0xAC00 + 11172)
}
# Code points whose Comp_Ex property is "Y".
ce = {i for i, u in enumerate(ucd) if u["Comp_Ex"] == "Y"}

assert not any(v for v in dm.values() if len(v) not in (1, 2))

# Single-code-point decompositions: sorted unique targets, split by plane
# (only planes 0 and 2 are allowed) and stored as their low 16 bits.
dm1 = sorted(set(v for v in dm.values() if len(v) == 1))
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ["0x%04X" % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Index 0 is reserved for "no decomposition", so dm1 indices start at 1.
dm1_order = {v: i + 1 for i, v in enumerate(dm1)}

# Two-code-point decompositions.  Each item is ((first, second, composite),
# (first, second)); composite is the source code point, or 0 when that code
# point is in `ce` or has a non-zero combining class.
dm2 = sorted(
    (v + (i if i not in ce and not ccc[i] else 0,), v)
    for i, v in dm.items()
    if len(v) == 2
)


def _fits_u32(triple):
    """Return True if (first, second, composite) passes the compact-form masks.

    The masks require first < 0x800, second in 0x0300..0x037F, and
    composite to have bits 14-15 and bits 20 and above clear.
    """
    # NOTE(review): 0xFFF0C000 does not test bits 16-19 of the composite.
    # If the third field of HB_CODEPOINT_ENCODE3_11_7_14 is 14 bits wide (as
    # its name suggests), 0xFFFFC000 may have been intended -- confirm.
    return (
        (triple[0] & 0xFFFFF800) == 0x0000
        and (triple[1] & 0xFFFFFF80) == 0x0300
        and (triple[2] & 0xFFF0C000) == 0x0000
    )


dm2_u32_array = [v for v in dm2 if _fits_u32(v[0])]
dm2_u64_array = [v for v in dm2 if not _fits_u32(v[0])]
# The compact entries must form a prefix of the sorted list, because the
# indices in dm2_order below are positions in the concatenation.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = [
    "HB_CODEPOINT_ENCODE3_11_7_14 (0x%04X, 0x%04X, 0x%04X)" % v[0]
    for v in dm2_u32_array
]
dm2_u64_array = [
    "HB_CODEPOINT_ENCODE3 (0x%04X, 0x%04X, 0x%04X)" % v[0] for v in dm2_u64_array
]

# dm2 indices continue after the reserved 0 and all dm1 indices.
dm2_base = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i + dm2_base for i, v in enumerate(dm2)}

dm_order = {None: 0}
dm_order.update(dm1_order)
dm_order.update(dm2_order)


# Prepare General_Category / Script mapping arrays

# gc_order maps both ways: index -> category name and category name -> index.
gc_order = dict()
for i, v in enumerate(
    (
        "Cc",
        "Cf",
        "Cn",
        "Co",
        "Cs",
        "Ll",
        "Lm",
        "Lo",
        "Lt",
        "Lu",
        "Mc",
        "Me",
        "Mn",
        "Nd",
        "Nl",
        "No",
        "Pc",
        "Pd",
        "Pe",
        "Pf",
        "Pi",
        "Po",
        "Ps",
        "Sc",
        "Sk",
        "Sm",
        "So",
        "Zl",
        "Zp",
        "Zs",
    )
):
    gc_order[i] = v
    gc_order[v] = i

# sc_order maps both ways between a four-letter script tag and its index;
# sc_array holds the script identifier emitted for each index.
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# Use a context manager so the header file is closed deterministically, and
# an explicit encoding so parsing does not depend on the platform locale.
with open(hb_script_list_h, encoding="utf-8") as script_list:
    for line in script_list:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        tag = "".join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        if language.name == "rust":
            name = name.replace("HB_SCRIPT_", "script::")
        sc_array.append(name)


# Write out main data

DEFAULT = "DEFAULT"
COMPACT = "COMPACT"
SLOPPY = "SLOPPY"

# Compression setting passed to packTab.pack_table() for each mode.
compression_level = {
    DEFAULT: 3,
    COMPACT: 9,
    SLOPPY: 9,
}

logging.info("Generating output...")
print("/* == Start of generated table == */")
print("/*")
print(" * The following table is generated by running:")
print(" *")
print(
    " * ./gen-ucd-table.py %sucd.nounihan.grouped.xml hb-script-list.h"
    % (("--%s " % language.name) if language.name != "c" else "")
)
print(" *")
print(" * on file with this description:", ucdxml.description)
print(" */")
print()
if language.name == "c":
    print("#ifndef HB_UCD_TABLE_HH")
    print("#define HB_UCD_TABLE_HH")
    print()
    print('#include "hb.hh"')
    print()
elif language.name == "rust":
    print("pub(crate) mod ucd {")
    print()
    print("#![allow(unused_parens)]")
    print("#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]")
    print()
    print("use crate::hb::algs::{HB_CODEPOINT_ENCODE3, HB_CODEPOINT_ENCODE3_11_7_14};")
    print("use crate::hb::common::script;")
    print("use crate::hb::common::Script as hb_script_t;")
    print()
else:
    assert False, "Unknown language: %s" % language.name

# Write mapping data

uint16_t = language.type_name("u16")
uint32_t = language.type_name("u32")
uint64_t = language.type_name("u64")

if language.name == "c":
    private = True
elif language.name == "rust":
    private = False
else:
    assert False, "Unknown language: %s" % language.name

code = packTab.Code("_hb_ucd")
sc_array, _ = code.addArray("hb_script_t", "sc_map", sc_array)
dm1_p0_array, _ = code.addArray(uint16_t, "dm1_p0_map", dm1_p0_array)
dm1_p2_array, _ = code.addArray(uint16_t, "dm1_p2_map", dm1_p2_array)
dm2_u32_array, _ = code.addArray(uint32_t, "dm2_u32_map", dm2_u32_array)
dm2_u64_array, _ = code.addArray(uint64_t, "dm2_u64_map", dm2_u64_array)
code.print_code(language=language, private=private)

# (name, data, default value, value<->integer mapping) for each packed table.
datasets = [
    ("gc", gc, "Cn", gc_order),
    ("ccc", ccc, 0, None),
    ("bmg", bmg, 0, None),
    ("sc", sc, "Zzzz", sc_order),
    ("dm", dm, None, dm_order),
]


def _smear_placeholder(values, placeholder, block=128):
    """Overwrite `placeholder` entries with a neighbour's value, in place.

    Within each run of `block` consecutive indices, a forward pass copies the
    previous entry over every placeholder that is not at the start of the
    block; a backward pass then copies the next entry over every placeholder
    that is not at the end of the block.  Values never cross a block boundary.
    """
    for i in range(len(values)):
        if (i % block) and values[i] == placeholder:
            values[i] = values[i - 1]
    for i in range(len(values) - 2, -1, -1):
        if ((i + 1) % block) and values[i] == placeholder:
            values[i] = values[i + 1]


# Write main data

# Text emitted before each mode's tables.  Iteration relies on dict insertion
# order; the None key only emits its closing text.
modes = {}
if language.name == "c":
    modes[DEFAULT] = "#ifndef HB_OPTIMIZE_SIZE"
    modes[COMPACT] = "#elif !defined(HB_NO_UCD_UNASSIGNED)"
    modes[SLOPPY] = "#else"
    modes[None] = "#endif"
else:
    modes[DEFAULT] = ""

# NOTE(review): the nesting of the bare print() calls in this loop was
# reconstructed from a whitespace-collapsed listing; it affects only blank
# lines in the output -- confirm against the canonical script.
for step, text in modes.items():
    print()
    if text:
        print(text)
        print()
    if step is None:
        continue

    compression = compression_level[step]
    logging.info(" Compression=%d:", compression)

    if step == SLOPPY:
        # Mutates gc and sc in place, which is safe only because no mode
        # that emits tables follows SLOPPY in `modes`.
        _smear_placeholder(gc, "Cn")
        _smear_placeholder(sc, "Zzzz")

    code = packTab.Code("_hb_ucd")

    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(
            data, default, mapping=mapping, compression=compression
        )
        logging.info(" Dataset=%-8s FullCost=%d", name, sol.fullCost)
        sol.genCode(code, name, private=private, language=language)

    code.print_code(language=language)

    print()

# Close whatever the language-specific header opened: the include guard for
# C, the module brace for Rust.
_footer_by_language = {
    "c": "#endif /* HB_UCD_TABLE_HH */",
    "rust": "}",
}
if language.name in _footer_by_language:
    print(_footer_by_language[language.name])
else:
    assert False, "Unknown language: %s" % language.name

# Blank line, then the end-of-table marker.
for trailer in ("", "/* == End of generated table == */"):
    print(trailer)
logging.info("Done.")