tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-use-table.py (15145B)


      1 #!/usr/bin/env python3
      2 # flake8: noqa: F821
      3 
      4 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
      5 
      6 Input files:
      7 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
      8 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
      9 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
     10 * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
     11 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
     12 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
     13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
     14 * ms-use/IndicSyllabicCategory-Additional.txt
     15 * ms-use/IndicPositionalCategory-Additional.txt
     16 """
     17 
     18 import packTab
     19 
     20 import sys
     21 import logging
     22 
     23 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
     24 
     25 if len(sys.argv) > 1 and sys.argv[1] == "--rust":
     26    del sys.argv[1]
     27    logging.info("Generating Rust code...")
     28    language = "rust"
     29 else:
     30    logging.info("Generating C code...")
     31    language = "c"
     32 language = packTab.languages[language]
     33 
     34 import sys
     35 
     36 if len (sys.argv) != 10:
     37 sys.exit (__doc__)
     38 
# Scripts whose codepoints are dropped from the generated table entirely
# (filtered out after the merge below) — presumably because they are
# shaped by something other than USE; not shown in this file.
DISABLED_SCRIPTS = {
	'Arabic',
	'Lao',
	'Samaritan',
	'Syriac',
	'Thai',
}
     46 
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# Capture the first two header lines of every file except UnicodeData.txt
# (argv index 4), which has no header.  The ms-use "Additional" files
# (indices 7 and 8) carry a longer header terminated by a blank line, so
# keep reading those until the blank line.  Note the `j - 1` below:
# `headers` has no slot for the skipped UnicodeData.txt, so files 7 and 8
# live at headers[6] and headers[7].
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
for j in range(7, 9):
	for line in files[j]:
		line = line.rstrip()
		if not line:
			break
		headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])
     57 
# Parse every input file into a {codepoint: value} dict, one per file.
# Only slots 0..6 end up populated: the two "Additional" overlay files
# (indices 7 and 8) are folded onto slots 0 (UISC) and 1 (UIPC) below.
unicode_data = [{} for _ in files]
values = [{} for _ in files]  # per-slot: value -> count of codepoints carrying it
for i, f in enumerate (files):
	for line in f:

		# Strip trailing comments.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		# First field is a hex codepoint or a `start..end` range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# The value of interest is field 2 for ArabicShaping.txt (i == 2)
		# and UnicodeData.txt (i == 4); field 1 for every other file.
		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			# Prefix joining types so they cannot clash with other values.
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			# From DerivedCoreProperties.txt keep only Default_Ignorable.
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		# Fold the two "Additional" overlay files onto slots 0 and 1,
		# letting them override the stock UCD assignments.
		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			unicode_data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1
     94 
# Per-slot default values, in the same order as the columns of `combined`:
# UISC, UIPC, joining type, Default_Ignorable, General_Category, Block,
# Script.  Slot 3's default is '' (falsy), so "not default-ignorable".
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (unicode_data):
	for u,v in d.items ():
		if not u in combined:
			# Only the first four files (UISC / UIPC / joining type /
			# ignorable) may introduce new codepoints; UnicodeData,
			# Blocks and Scripts merely annotate existing ones.
			if i >= 4:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Drop codepoints belonging to scripts excluded from this table.
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
    109 
    110 
# Every property value referenced by the classification code below; each
# name is materialized as a module-level PropertyValue further down.
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Reordering_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]
    189 
    190 class PropertyValue(object):
    191 def __init__(self, name_):
    192 	self.name = name_
    193 def __str__(self):
    194 	return self.name
    195 def __eq__(self, other):
    196 	return self.name == (other if isinstance(other, str) else other.name)
    197 def __ne__(self, other):
    198 	return not (self == other)
    199 def __hash__(self):
    200 	return hash(str(self))
    201 
    202 property_values = {}
    203 
    204 for name in property_names:
    205 value = PropertyValue(name)
    206 assert value not in property_values
    207 assert value not in globals()
    208 property_values[name] = value
    209 globals().update(property_values)
    210 
    211 
# Predicates classifying each codepoint into a USE category.  Shared
# signature:
#   U    - the codepoint (int)
#   UISC - Indic_Syllabic_Category
#   UDI  - truthy iff the codepoint is Default_Ignorable_Code_Point
#   UGC  - General_Category
#   AJT  - Arabic joining type (jt_*)
# The bare names (Number, Lo, jt_C, ...) are the PropertyValue globals
# injected above.  Exactly one predicate must accept any given codepoint
# (enforced by an assert in map_to_use).
def is_BASE(U, UISC, UDI, UGC, AJT):
	return (UISC in [Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent,
			] or
		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
		AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
					Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	if UISC == Consonant_Placeholder: return True
	# A handful of dashes / bullets / geometric shapes act as bases too.
	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
	# Also includes VARIATION_SELECTOR and ZWJ
	return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	return ((UISC == Consonant_Final and UGC != Lo) or
		UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return (UISC == Consonant_Medial and UGC != Lo or
		UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+0DCA SINHALA SIGN AL-LAKUNA
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return (UISC == Invisible_Stacker
		and not is_SAKOT(U, UISC, UDI, UGC, AJT)
	)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	# Also includes BASE_IND and SYM
	# The negative clauses make OTHER the residue after every more
	# specific category has had a chance to claim the codepoint.
	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
		and not is_BASE(U, UISC, UDI, UGC, AJT)
		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
		and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
		and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
	)
def is_REORDERING_KILLER(U, UISC, UDI, UGC, AJT):
	return UISC == Reordering_Killer
def is_REPHA(U, UISC, UDI, UGC, AJT):
	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+1A60 TAI THAM SIGN SAKOT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	return (UISC == Pure_Killer or
		UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
		UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	# Also includes Rsv
	# NOTE(review): the excluded codepoints are the Hangul fillers and
	# the Duployan shorthand format controls — presumably kept out of
	# WJ deliberately; confirm against the USE spec.
	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
		and UISC == Other
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
	) or UGC == Cn
    300 
# USE category tag -> classifying predicate.  For every codepoint in
# `combined`, exactly one of these predicates must return true (asserted
# in map_to_use below).
use_mapping = {
	'B':	is_BASE,
	'N':	is_BASE_NUM,
	'GB':	is_BASE_OTHER,
	'CGJ':	is_CGJ,
	'F':	is_CONS_FINAL,
	'FM':	is_CONS_FINAL_MOD,
	'M':	is_CONS_MED,
	'CM':	is_CONS_MOD,
	'SUB':	is_CONS_SUB,
	'CS':	is_CONS_WITH_STACKER,
	'H':	is_HALANT,
	'HVM':	is_HALANT_OR_VOWEL_MODIFIER,
	'HN':	is_HALANT_NUM,
	'IS':	is_INVISIBLE_STACKER,
	'G':	is_HIEROGLYPH,
	'HM':	is_HIEROGLYPH_MOD,
	'HR':	is_HIEROGLYPH_MIRROR,
	'J':	is_HIEROGLYPH_JOINER,
	'SB':	is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE':	is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ':	is_ZWNJ,
	'O':	is_OTHER,
	'RK':	is_REORDERING_KILLER,
	'R':	is_REPHA,
	'Sk':	is_SAKOT,
	'SM':	is_SYM_MOD,
	'V':	is_VOWEL,
	'VM':	is_VOWEL_MOD,
	'WJ':	is_Word_Joiner,
}
    332 
# Positional refinement: for each USE category that is split further by
# position, map the tag suffix ('Abv'/'Blw'/'Pst'/'Pre') to the UIPC
# values it covers.  A value of None marks a category that is known here
# (so the assert in map_to_use accepts it) but takes no suffix.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Not_Applicable],
	},
	'R': None,
	'RK': None,
	'SUB': None,
}
    380 
def map_to_use(data):
	"""Map merged UCD data to USE categories.

	data: {codepoint: [UISC, UIPC, AJT, UDI, UGC, UBlock, script]}
	Returns {codepoint: (USE tag incl. positional suffix, UBlock)}.
	"""
	out = {}
	items = use_mapping.items()
	for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if U == 0x1CED: UISC = Tone_Mark

		# Exactly one category predicate must claim the codepoint.
		values = [k for k,v in items if v(U, UISC, UDI, UGC, AJT)]
		assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, values)
		USE = values[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		#  and https://github.com/harfbuzz/harfbuzz/issues/1631
		if U in [0x11302, 0x11303, 0x114C1]: UIPC = Top

		# A real positional category only makes sense for categories
		# listed in use_positions (plus two known exceptional codepoints).
		assert (UIPC in [Not_Applicable, Visual_Order_Left] or
			U in {0x0F7F, 0x11A3A} or
			USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

		pos_mapping = use_positions.get(USE, None)
		if pos_mapping:
			# Append the (unique) positional suffix, e.g. 'V' -> 'VPst'.
			values = [k for k,v in pos_mapping.items() if v and UIPC in v]
			assert len(values) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, values)
			USE = USE + values[0]

		out[U] = (USE, UBlock)
	return out
    421 
use_data = map_to_use(combined)

# Emit the table preamble.  The input-file headers are echoed so the
# generated file records exactly which UCD versions produced it.
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" *   {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
if language.name == "c":
   # C preamble: include guard, headers, then one short #define per USE
   # category (and per positional variant) so the table body can use the
   # bare tag names.
   print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
   print ("#define HB_OT_SHAPER_USE_TABLE_HH")
   print ()
   print ('#include "hb.hh"')
   print ()
   print ('#include "hb-ot-shaper-use-machine.hh"')
   print ()

   print ('#pragma GCC diagnostic push')
   print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
   # Categories that have positional variants get their variant tags
   # defined instead of the base tag.
   for k,v in sorted(use_mapping.items()):
       if k in use_positions and use_positions[k]: continue
       print ("#define %s	USE(%s)	/* %s */" % (k, k, v.__name__[3:]))
   for k,v in sorted(use_positions.items()):
       if not v: continue
       for suf in v.keys():
           tag = k + suf
           print ("#define %s	USE(%s)" % (tag, tag))
   print ('#pragma GCC diagnostic pop')
   print ("")

elif language.name == "rust":
   # Rust preamble: lint allowances plus the category-constant import.
   print()
   print("#![allow(unused_parens)]")
   print("#![allow(clippy::unnecessary_cast, clippy::unreadable_literal, clippy::double_parens)]")
   print()
   print("use super::ot_shaper_use::category::*;")
   print()
else:
   assert False, "Unknown language: %s" % language.name
    468 
uu = sorted (use_data.keys ())  # NOTE(review): appears unused below — verify before removing

# Final mapping handed to packTab: codepoint -> USE category tag.
data = {u:v[0] for u,v in use_data.items()}

if language.name == "c":
   private = True
elif language.name == "rust":
   private = False
   language.public_function_linkage = "pub(crate)"
else:
   assert False, "Unknown language: %s" % language.name


DEFAULT = "DEFAULT"
COMPACT = "COMPACT"

# packTab compression level for each table variant.
compression_level = {
   DEFAULT: 5,
   COMPACT: 9,
}

# For C, emit two table variants guarded by HB_OPTIMIZE_SIZE; the None
# step only prints the closing "#endif".  Relies on dict insertion order:
# DEFAULT, COMPACT, then None.  Rust gets a single DEFAULT table.
modes = {}
if language.name == "c":
   modes[DEFAULT] = "#ifndef HB_OPTIMIZE_SIZE"
   modes[COMPACT] = "#else"
   modes[None] = "#endif"
else:
   modes[DEFAULT] = ""

for step, text in modes.items():
   print()
   if text:
       print(text)
       print()
   if step is None:
       # Guard-closing step: nothing to generate.
       continue

   compression = compression_level[step]
   logging.info("  Compression=%d:" % compression)

   # Pack the codepoint->category table and print the generated lookup code.
   code = packTab.Code('hb_use')
   sol = packTab.pack_table(data, compression=compression, default='O')
   logging.info('      FullCost=%d' % (sol.fullCost))
   sol.genCode(code, f'get_category', language=language, private=private)
   code.print_code(language=language, private=private)
   print ()

# C epilogue: undefine the short category macros and close the include guard.
if language.name == "c":
   print ()
   for k in sorted(use_mapping.keys()):
       if k in use_positions and use_positions[k]: continue
       print ("#undef %s" % k)
   for k,v in sorted(use_positions.items()):
       if not v: continue
       for suf in v.keys():
           tag = k + suf
           print ("#undef %s" % tag)
   print ()
   print ()
   print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
elif language.name == "rust":
   pass
else:
   assert False, "Unknown language: %s" % language.name
print ("/* == End of generated table == */")