gen-use-table.py (15145B)
#!/usr/bin/env python3
# flake8: noqa: F821

"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt

Input files:
* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
* ms-use/IndicSyllabicCategory-Additional.txt
* ms-use/IndicPositionalCategory-Additional.txt
"""

import contextlib
import logging
import sys

import packTab

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

# An optional leading "--rust" switches the emitted code from C to Rust.
# The flag is removed from argv so the positional file arguments keep the
# same indices in both modes.
if len(sys.argv) > 1 and sys.argv[1] == "--rust":
    del sys.argv[1]
    logging.info("Generating Rust code...")
    language = "rust"
else:
    logging.info("Generating C code...")
    language = "c"
language = packTab.languages[language]

# Exactly nine input files are required (argv[0] is the script itself).
if len (sys.argv) != 10:
    sys.exit (__doc__)

# Code points whose Scripts.txt value is one of these are dropped from the
# generated table.
DISABLED_SCRIPTS = {
    'Arabic',
    'Lao',
    'Samaritan',
    'Syriac',
    'Thai',
}

# File indices, following the order of the command line (see the usage text):
#   0 IndicSyllabicCategory.txt    1 IndicPositionalCategory.txt
#   2 ArabicShaping.txt            3 DerivedCoreProperties.txt
#   4 UnicodeData.txt              5 Blocks.txt
#   6 Scripts.txt
#   7 IndicSyllabicCategory-Additional.txt    (merged into slot 0)
#   8 IndicPositionalCategory-Additional.txt  (merged into slot 1)
#
# The ExitStack guarantees that every input file is closed once parsing is
# finished, including when one of the open() calls or the parsing itself fails.
with contextlib.ExitStack() as stack:
    files = [stack.enter_context (open (x, encoding='utf-8')) for x in sys.argv[1:]]

    # The first two lines of every file except UnicodeData.txt (index 4) are
    # kept as its header.  Skipping index 4 shifts later entries down by one,
    # so the two "-Additional" files (7 and 8) sit at headers[6] and headers[7].
    headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
    for j in range(7, 9):
        # For the "-Additional" files the header continues up to the first
        # blank line.
        for line in files[j]:
            line = line.rstrip()
            if not line:
                break
            headers[j - 1].append(line)
    headers.append (["UnicodeData.txt does not have a header."])

    # unicode_data[slot] maps code point -> property value;
    # values[slot] maps property value -> number of code points seen with it.
    unicode_data = [{} for _ in files]
    values = [{} for _ in files]
    for i, f in enumerate (files):
        for line in f:

            # Strip trailing comments.
            j = line.find ('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip () for x in line.split (';')]
            if len (fields) == 1:
                # No ';' on the line: blank or comment-only.
                continue

            # The first field is either a single code point or a "start..end" range.
            uu = fields[0].split ('..')
            start = int (uu[0], 16)
            if len (uu) == 1:
                end = start
            else:
                end = int (uu[1], 16)

            # ArabicShaping.txt and UnicodeData.txt carry the value of interest
            # in their third field; every other file uses the second one.
            t = fields[1 if i not in [2, 4] else 2]

            if i == 2:
                # Prefix joining types so they get their own 'jt_' namespace.
                t = 'jt_' + t
            elif i == 3 and t != 'Default_Ignorable_Code_Point':
                # Only this one derived core property is used.
                continue
            elif i == 7 and t == 'Consonant_Final_Modifier':
                # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
                t = 'Syllable_Modifier'
            elif i == 8 and t == 'NA':
                t = 'Not_Applicable'

            # The "-Additional" files are stored into the slots of the files
            # they supplement (7 -> 0, 8 -> 1); being read later, their entries
            # override the UCD ones.
            i0 = i if i < 7 else i - 7
            for u in range (start, end + 1):
                unicode_data[i0][u] = t
            values[i0][t] = values[i0].get (t, 0) + end - start + 1

# Per-slot value used for code points a file does not mention.  Order:
# (UISC, UIPC, joining type, default-ignorable, General_Category, block, script).
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
    for u,v in d.items ():
        if u not in combined:
            # Only the first four files can introduce a code point; the
            # remaining ones merely annotate code points that already exist.
            if i >= 4:
                continue
            combined[u] = list (defaults)
        combined[u][i] = v
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}


property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
    'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
    'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other',
    'Bindu',
    'Visarga',
    'Avagraha',
    'Nukta',
    'Virama',
    'Pure_Killer',
    'Reordering_Killer',
    'Invisible_Stacker',
    'Vowel_Independent',
    'Vowel_Dependent',
    'Vowel',
    'Consonant_Placeholder',
    'Consonant',
    'Consonant_Dead',
    'Consonant_With_Stacker',
    'Consonant_Prefixed',
    'Consonant_Preceding_Repha',
    'Consonant_Succeeding_Repha',
    'Consonant_Subjoined',
    'Consonant_Medial',
    'Consonant_Final',
    'Consonant_Head_Letter',
    'Consonant_Initial_Postfixed',
    'Modifying_Letter',
    'Tone_Letter',
    'Tone_Mark',
    'Gemination_Mark',
    'Cantillation_Mark',
    'Register_Shifter',
    'Syllable_Modifier',
    'Consonant_Killer',
    'Non_Joiner',
    'Joiner',
    'Number_Joiner',
    'Number',
    'Brahmi_Joining_Number',
    'Symbol_Modifier',
    'Hieroglyph',
    'Hieroglyph_Joiner',
    'Hieroglyph_Mark_Begin',
    'Hieroglyph_Mark_End',
    'Hieroglyph_Mirror',
    'Hieroglyph_Modifier',
    'Hieroglyph_Segment_Begin',
    'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable',
    'Right',
    'Left',
    'Visual_Order_Left',
    'Left_And_Right',
    'Top',
    'Bottom',
    'Top_And_Bottom',
    'Top_And_Bottom_And_Left',
    'Top_And_Right',
    'Top_And_Left',
    'Top_And_Left_And_Right',
    'Bottom_And_Left',
    'Bottom_And_Right',
    'Top_And_Bottom_And_Right',
    'Overstruck',
    # Joining_Type
    'jt_C',
    'jt_D',
    'jt_L',
    'jt_R',
    'jt_T',
    'jt_U',
    'jt_X',
]

class PropertyValue(object):
    """Named property value that compares equal to its own name.

    An instance equals both another PropertyValue with the same name and
    the plain string of that name, and hashes like that string.  This lets
    the raw strings parsed from the data files be compared directly against
    the module-level constants created below.
    """
    def __init__(self, name_):
        self.name = name_
    def __str__(self):
        return self.name
    def __eq__(self, other):
        return self.name == (other if isinstance(other, str) else other.name)
    def __ne__(self, other):
        return not (self == other)
    def __hash__(self):
        return hash(str(self))

property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # Guard against duplicates in property_names and against clobbering an
    # existing module-level name (the lookups work by name because of
    # PropertyValue's __eq__/__hash__).
    assert value not in property_values
    assert value not in globals()
    property_values[name] = value
# Publish every property value as a module-level name (Lo, Virama, Top, ...)
# so the predicates and tables below can refer to them bare.  Static checkers
# cannot see these names, hence the F821 suppression at the top of the file.
globals().update(property_values)


# USE category predicates.
#
# Every predicate takes the same five arguments:
#   U     code point (int)
#   UISC  Indic_Syllabic_Category value
#   UDI   truthy when the code point is a Default_Ignorable_Code_Point
#         ('' otherwise)
#   UGC   General_Category value
#   AJT   joining type from ArabicShaping.txt, 'jt_'-prefixed
# map_to_use() asserts that exactly one predicate accepts each code point.

def is_BASE(U, UISC, UDI, UGC, AJT):
    return (UISC in [Number, Consonant, Consonant_Head_Letter,
                     Tone_Letter,
                     Vowel_Independent,
                     ] or
            # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
            AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
            (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                    Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    if UISC == Consonant_Placeholder: return True
    return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
    # Also includes VARIATION_SELECTOR and ZWJ
    return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    return ((UISC == Consonant_Final and UGC != Lo) or
            UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    return (UISC == Consonant_Medial and UGC != Lo or
            UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
    return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
    return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return (UISC == Invisible_Stacker
            and not is_SAKOT(U, UISC, UDI, UGC, AJT)
    )
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
    # Also includes BASE_IND and SYM
    return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
            and not is_BASE(U, UISC, UDI, UGC, AJT)
            and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
            and not is_CGJ(U, UISC, UDI, UGC, AJT)
            and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
            and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
    )
def is_REORDERING_KILLER(U, UISC, UDI, UGC, AJT):
    return UISC == Reordering_Killer
def is_REPHA(U, UISC, UDI, UGC, AJT):
    return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    # Split off of HALANT
    return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    return (UISC == Pure_Killer or
            UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
            UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    # Also includes Rsv
    return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
            and UISC == Other
            and not is_CGJ(U, UISC, UDI, UGC, AJT)
    ) or UGC == Cn

# USE category tag -> predicate deciding membership in that category.
use_mapping = {
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'IS': is_INVISIBLE_STACKER,
    'G': is_HIEROGLYPH,
    'HM': is_HIEROGLYPH_MOD,
    'HR': is_HIEROGLYPH_MIRROR,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'RK': is_REORDERING_KILLER,
    'R': is_REPHA,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
    'WJ': is_Word_Joiner,
}

# For categories that are subdivided by position: suffix -> the
# Indic_Positional_Category values selecting that suffix (e.g. 'V' + 'Abv').
# A None entry means the category may carry a positional category but is not
# subdivided by it.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HM': None,
    'HR': None,
    'HVM': None,
    'IS': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'RK': None,
    'SUB': None,
}

def map_to_use(data):
    """Classify every code point in *data* into its USE category.

    data maps a code point to the 7-item record
    (UISC, UIPC, AJT, UDI, UGC, UBlock, script) in the order of `defaults`.

    Returns a dict mapping each code point to (USE tag, UBlock), where the
    tag carries a positional suffix ('Abv', 'Blw', 'Pst', 'Pre') for the
    categories that use_positions subdivides.

    Raises AssertionError when a code point matches zero or several
    predicates, has a positional category its USE category does not
    expect, or does not select exactly one positional suffix.
    """
    out = {}
    items = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category

        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

        # Tibetan:
        # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
        if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent

        # TODO: U+1CED should only be allowed after some of
        # the nasalization marks, maybe only for U+1CE9..U+1CF1.
        if U == 0x1CED: UISC = Tone_Mark

        # The predicates must partition the code points: exactly one may match.
        categories = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
        assert len(categories) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, categories)
        USE = categories[0]

        # Resolve Indic_Positional_Category

        # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
        # and https://github.com/harfbuzz/harfbuzz/issues/1631
        if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top

        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                U in {0x0F7F, 0x11A3A} or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        pos_mapping = use_positions.get(USE, None)
        if pos_mapping:
            suffixes = [k for k,v in pos_mapping.items() if v and UIPC in v]
            assert len(suffixes) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, suffixes)
            USE = USE + suffixes[0]

        out[U] = (USE, UBlock)
    return out

use_data = map_to_use(combined)

print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
    for header_line in h:
        print (" * %s" % (header_line.strip()))
print (" */")
print ()
if language.name == "c":
    print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
    print ("#define HB_OT_SHAPER_USE_TABLE_HH")
    print ()
    print ('#include "hb.hh"')
    print ()
    print ('#include "hb-ot-shaper-use-machine.hh"')
    print ()

    # Short macro aliases for the category tags: one per category that has
    # no positional sub-categories, and one per category+suffix otherwise.
    print ('#pragma GCC diagnostic push')
    print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
    for k,v in sorted(use_mapping.items()):
        if k in use_positions and use_positions[k]: continue
        # v.__name__[3:] drops the leading "is_" of the predicate name.
        print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
    for k,v in sorted(use_positions.items()):
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            print ("#define %s USE(%s)" % (tag, tag))
    print ('#pragma GCC diagnostic pop')
    print ("")

elif language.name == "rust":
    print()
    print("#![allow(unused_parens)]")
    print("#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]")
    print()
    print("use super::ot_shaper_use::category::*;")
    print()
else:
    assert False, "Unknown language: %s" % language.name

# Code point -> USE tag; the block name is not needed for the packed table.
data = {u:v[0] for u,v in use_data.items()}

if language.name == "c":
    private = True
elif language.name == "rust":
    private = False
    language.public_function_linkage = "pub(crate)"
else:
    assert False, "Unknown language: %s" % language.name


DEFAULT = "DEFAULT"
COMPACT = "COMPACT"

# packTab compression setting used for each table variant.
compression_level = {
    DEFAULT: 5,
    COMPACT: 9,
}

# Variant -> text printed before it.  For C both variants are emitted and
# selected with HB_OPTIMIZE_SIZE; the None entry only closes the #if.
modes = {}
if language.name == "c":
    modes[DEFAULT] = "#ifndef HB_OPTIMIZE_SIZE"
    modes[COMPACT] = "#else"
    modes[None] = "#endif"
else:
    modes[DEFAULT] = ""

for step, text in modes.items():
    print()
    if text:
        print(text)
        print()
    if step is None:
        continue

    compression = compression_level[step]
    logging.info(" Compression=%d:", compression)

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info(' FullCost=%d', sol.fullCost)
    sol.genCode(code, 'get_category', language=language, private=private)
    code.print_code(language=language, private=private)
    print ()

if language.name == "c":
    # Undefine every macro alias defined before the tables.
    print ()
    for k in sorted(use_mapping.keys()):
        if k in use_positions and use_positions[k]: continue
        print ("#undef %s" % k)
    for k,v in sorted(use_positions.items()):
        if not v: continue
        for suf in v.keys():
            tag = k + suf
            print ("#undef %s" % tag)
    print ()
    print ()
    print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
elif language.name == "rust":
    pass
else:
    assert False, "Unknown language: %s" % language.name
print ("/* == End of generated table == */")