tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-indic-table.py (17498B)


      1 #!/usr/bin/env python3
      2 
      3 """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
      4 
      5 Input files:
      6 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
      7 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
      8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
      9 """
     10 
     11 import sys
     12 
     13 if len (sys.argv) != 4:
     14 sys.exit (__doc__)
     15 
     16 ALLOWED_SINGLES = [0x00A0, 0x25CC]
     17 ALLOWED_BLOCKS = [
     18 'Basic Latin',
     19 'Latin-1 Supplement',
     20 'Devanagari',
     21 'Bengali',
     22 'Gurmukhi',
     23 'Gujarati',
     24 'Oriya',
     25 'Tamil',
     26 'Telugu',
     27 'Kannada',
     28 'Malayalam',
     29 'Myanmar',
     30 'Khmer',
     31 'Vedic Extensions',
     32 'General Punctuation',
     33 'Superscripts and Subscripts',
     34 'Devanagari Extended',
     35 'Myanmar Extended-B',
     36 'Myanmar Extended-A',
     37 'Myanmar Extended-C',
     38 ]
     39 
     40 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
     41 
     42 headers = [[f.readline () for i in range (2)] for f in files]
     43 
     44 unicode_data = [{} for _ in files]
     45 for i, f in enumerate (files):
     46 for line in f:
     47 
     48 	j = line.find ('#')
     49 	if j >= 0:
     50 		line = line[:j]
     51 
     52 	fields = [x.strip () for x in line.split (';')]
     53 	if len (fields) == 1:
     54 		continue
     55 
     56 	uu = fields[0].split ('..')
     57 	start = int (uu[0], 16)
     58 	if len (uu) == 1:
     59 		end = start
     60 	else:
     61 		end = int (uu[1], 16)
     62 
     63 	t = fields[1]
     64 
     65 	for u in range (start, end + 1):
     66 		unicode_data[i][u] = t
     67 
     68 # Merge data into one dict:
     69 defaults = ('Other', 'Not_Applicable', 'No_Block')
     70 combined = {}
     71 for i,d in enumerate (unicode_data):
     72 for u,v in d.items ():
     73 	if i == 2 and not u in combined:
     74 		continue
     75 	if not u in combined:
     76 		combined[u] = list (defaults)
     77 	combined[u][i] = v
     78 combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
     79 
     80 
     81 # Convert categories & positions types
     82 
     83 categories = {
     84  'indic' : [
     85    'X',
     86    'C',
     87    'V',
     88    'N',
     89    'H',
     90    'ZWNJ',
     91    'ZWJ',
     92    'M',
     93    'SM',
     94    'A',
     95    'VD',
     96    'PLACEHOLDER',
     97    'DOTTEDCIRCLE',
     98    'RS',
     99    'MPst',
    100    'Repha',
    101    'Ra',
    102    'CM',
    103    'Symbol',
    104    'CS',
    105    'SMPst',
    106  ],
    107  'khmer' : [
    108    'VAbv',
    109    'VBlw',
    110    'VPre',
    111    'VPst',
    112 
    113    'Robatic',
    114    'Xgroup',
    115    'Ygroup',
    116  ],
    117  'myanmar' : [
    118    'VAbv',
    119    'VBlw',
    120    'VPre',
    121    'VPst',
    122 
    123    'IV',
    124    'As',
    125    'DB',
    126    'GB',
    127    'MH',
    128    'MR',
    129    'MW',
    130    'MY',
    131    'PT',
    132    'VS',
    133    'ML',
    134  ],
    135 }
    136 
    137 category_map = {
    138  'Other'			: 'X',
    139  'Avagraha'			: 'Symbol',
    140  'Bindu'			: 'SM',
    141  'Brahmi_Joining_Number'	: 'PLACEHOLDER', # Don't care.
    142  'Cantillation_Mark'		: 'A',
    143  'Consonant'			: 'C',
    144  'Consonant_Dead'		: 'C',
    145  'Consonant_Final'		: 'CM',
    146  'Consonant_Head_Letter'	: 'C',
    147  'Consonant_Initial_Postfixed'	: 'C', # TODO
    148  'Consonant_Killer'		: 'M', # U+17CD only.
    149  'Consonant_Medial'		: 'CM',
    150  'Consonant_Placeholder'	: 'PLACEHOLDER',
    151  'Consonant_Preceding_Repha'	: 'Repha',
    152  'Consonant_Prefixed'		: 'X', # Don't care.
    153  'Consonant_Subjoined'		: 'CM',
    154  'Consonant_Succeeding_Repha'	: 'CM',
    155  'Consonant_With_Stacker'	: 'CS',
    156  'Gemination_Mark'		: 'SM', # https://github.com/harfbuzz/harfbuzz/issues/552
    157  'Invisible_Stacker'		: 'H',
    158  'Joiner'			: 'ZWJ',
    159  'Modifying_Letter'		: 'X',
    160  'Non_Joiner'			: 'ZWNJ',
    161  'Nukta'			: 'N',
    162  'Number'			: 'PLACEHOLDER',
    163  'Number_Joiner'		: 'PLACEHOLDER', # Don't care.
    164  'Pure_Killer'			: 'M', # Is like a vowel matra.
    165  'Register_Shifter'		: 'RS',
    166  'Syllable_Modifier'		: 'SM',
    167  'Tone_Letter'			: 'X',
    168  'Tone_Mark'			: 'N',
    169  'Virama'			: 'H',
    170  'Visarga'			: 'SM',
    171  'Vowel'			: 'V',
    172  'Vowel_Dependent'		: 'M',
    173  'Vowel_Independent'		: 'V',
    174 }
    175 position_map = {
    176  'Not_Applicable'		: 'END',
    177 
    178  'Left'			: 'PRE_C',
    179  'Top'				: 'ABOVE_C',
    180  'Bottom'			: 'BELOW_C',
    181  'Right'			: 'POST_C',
    182 
    183  # These should resolve to the position of the last part of the split sequence.
    184  'Bottom_And_Right'		: 'POST_C',
    185  'Left_And_Right'		: 'POST_C',
    186  'Top_And_Bottom'		: 'BELOW_C',
    187  'Top_And_Bottom_And_Left'	: 'BELOW_C',
    188  'Top_And_Bottom_And_Right'	: 'POST_C',
    189  'Top_And_Left'		: 'ABOVE_C',
    190  'Top_And_Left_And_Right'	: 'POST_C',
    191  'Top_And_Right'		: 'POST_C',
    192 
    193  'Overstruck'			: 'AFTER_MAIN',
    194  'Visual_order_left'		: 'PRE_M',
    195 }
    196 
    197 category_overrides = {
    198 
    199  # These are the variation-selectors. They only appear in the Myanmar grammar
    200  # but are not Myanmar-specific
    201  0xFE00: 'VS',
    202  0xFE01: 'VS',
    203  0xFE02: 'VS',
    204  0xFE03: 'VS',
    205  0xFE04: 'VS',
    206  0xFE05: 'VS',
    207  0xFE06: 'VS',
    208  0xFE07: 'VS',
    209  0xFE08: 'VS',
    210  0xFE09: 'VS',
    211  0xFE0A: 'VS',
    212  0xFE0B: 'VS',
    213  0xFE0C: 'VS',
    214  0xFE0D: 'VS',
    215  0xFE0E: 'VS',
    216  0xFE0F: 'VS',
    217 
    218  # These appear in the OT Myanmar spec, but are not Myanmar-specific
    219  0x2015: 'PLACEHOLDER',
    220  0x2022: 'PLACEHOLDER',
    221  0x25FB: 'PLACEHOLDER',
    222  0x25FC: 'PLACEHOLDER',
    223  0x25FD: 'PLACEHOLDER',
    224  0x25FE: 'PLACEHOLDER',
    225 
    226 
    227  # Indic
    228 
    229  0x0930: 'Ra', # Devanagari
    230  0x09B0: 'Ra', # Bengali
    231  0x09F0: 'Ra', # Bengali
    232  0x0A30: 'Ra', # Gurmukhi 	No Reph
    233  0x0AB0: 'Ra', # Gujarati
    234  0x0B30: 'Ra', # Oriya
    235  0x0BB0: 'Ra', # Tamil 	No Reph
    236  0x0C30: 'Ra', # Telugu 	Reph formed only with ZWJ
    237  0x0CB0: 'Ra', # Kannada
    238  0x0D30: 'Ra', # Malayalam 	No Reph, Logical Repha
    239 
    240  # The following act more like the Bindus.
    241  0x0953: 'SM',
    242  0x0954: 'SM',
    243 
    244  # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI.
    245  0x0A40: 'MPst',
    246 
    247  # The following act like consonants.
    248  0x0A72: 'C',
    249  0x0A73: 'C',
    250  0x1CF5: 'C',
    251  0x1CF6: 'C',
    252 
    253  # TODO: The following should only be allowed after a Visarga.
    254  # For now, just treat them like regular tone marks.
    255  0x1CE2: 'A',
    256  0x1CE3: 'A',
    257  0x1CE4: 'A',
    258  0x1CE5: 'A',
    259  0x1CE6: 'A',
    260  0x1CE7: 'A',
    261  0x1CE8: 'A',
    262 
    263  # TODO: The following should only be allowed after some of
    264  # the nasalization marks, maybe only for U+1CE9..U+1CF1.
    265  # For now, just treat them like tone marks.
    266  0x1CED: 'A',
    267 
    268  # The following take marks in standalone clusters, similar to Avagraha.
    269  0xA8F2: 'Symbol',
    270  0xA8F3: 'Symbol',
    271  0xA8F4: 'Symbol',
    272  0xA8F5: 'Symbol',
    273  0xA8F6: 'Symbol',
    274  0xA8F7: 'Symbol',
    275  0x1CE9: 'Symbol',
    276  0x1CEA: 'Symbol',
    277  0x1CEB: 'Symbol',
    278  0x1CEC: 'Symbol',
    279  0x1CEE: 'Symbol',
    280  0x1CEF: 'Symbol',
    281  0x1CF0: 'Symbol',
    282  0x1CF1: 'Symbol',
    283 
    284  0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524
    285 
    286  # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil,
    287  # so the Indic shaper needs to know their categories.
    288  0x11301: 'SM',
    289  0x11302: 'SM',
    290  0x11303: 'SM',
    291  0x1133B: 'N',
    292  0x1133C: 'N',
    293 
    294  0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552
    295  0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849
    296 
    297  0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613
    298  0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623
    299  0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511
    300 
    301  0x25CC: 'DOTTEDCIRCLE',
    302 
    303 
    304  # Khmer
    305 
    306  0x179A: 'Ra',
    307 
    308  0x17CC: 'Robatic',
    309  0x17C9: 'Robatic',
    310  0x17CA: 'Robatic',
    311 
    312  0x17C6: 'Xgroup',
    313  0x17CB: 'Xgroup',
    314  0x17CD: 'Xgroup',
    315  0x17CE: 'Xgroup',
    316  0x17CF: 'Xgroup',
    317  0x17D0: 'Xgroup',
    318  0x17D1: 'Xgroup',
    319 
    320  0x17C7: 'Ygroup',
    321  0x17C8: 'Ygroup',
    322  0x17DD: 'Ygroup',
    323  0x17D3: 'Ygroup', # Just guessing. Uniscribe doesn't categorize it.
    324 
    325  0x17D9: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/2384
    326 
    327 
    328  # Myanmar
    329 
    330  # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze
    331 
    332  0x104E: 'C', # The spec says C, IndicSyllableCategory says Consonant_Placeholder
    333 
    334  0x1004: 'Ra',
    335  0x101B: 'Ra',
    336  0x105A: 'Ra',
    337 
    338  0x1032: 'A',
    339  0x1036: 'A',
    340 
    341  0x103A: 'As',
    342 
    343  #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do.
    344 
    345  0x103E: 'MH',
    346  0x1060: 'ML',
    347  0x103C: 'MR',
    348  0x103D: 'MW',
    349  0x1082: 'MW',
    350  0x103B: 'MY',
    351  0x105E: 'MY',
    352  0x105F: 'MY',
    353 
    354  0x1063: 'PT',
    355  0x1064: 'PT',
    356  0x1069: 'PT',
    357  0x106A: 'PT',
    358  0x106B: 'PT',
    359  0x106C: 'PT',
    360  0x106D: 'PT',
    361  0xAA7B: 'PT',
    362 
    363  0x1038: 'SM',
    364  0x1087: 'SM',
    365  0x1088: 'SM',
    366  0x1089: 'SM',
    367  0x108A: 'SM',
    368  0x108B: 'SM',
    369  0x108C: 'SM',
    370  0x108D: 'SM',
    371  0x108F: 'SM',
    372  0x109A: 'SM',
    373  0x109B: 'SM',
    374  0x109C: 'SM',
    375 
    376  0x104A: 'PLACEHOLDER',
    377 }
    378 position_overrides = {
    379 
    380  0x0A51: 'BELOW_C', # https://github.com/harfbuzz/harfbuzz/issues/524
    381 
    382  0x0B01: 'BEFORE_SUB', # Oriya Bindu is BeforeSub in the spec.
    383 }
    384 
    385 def matra_pos_left(u, block):
    386  return "PRE_M"
    387 def matra_pos_right(u, block):
    388  if block == 'Devanagari':	return  'AFTER_SUB'
    389  if block == 'Bengali':	return  'AFTER_POST'
    390  if block == 'Gurmukhi':	return  'AFTER_POST'
    391  if block == 'Gujarati':	return  'AFTER_POST'
    392  if block == 'Oriya':		return  'AFTER_POST'
    393  if block == 'Tamil':		return  'AFTER_POST'
    394  if block == 'Telugu':		return  'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB'
    395  if block == 'Kannada':	return  'BEFORE_SUB' if u < 0x0CC3 or u > 0x0CD6 else 'AFTER_SUB'
    396  if block == 'Malayalam':	return  'AFTER_POST'
    397  return 'AFTER_SUB'
    398 def matra_pos_top(u, block):
    399  # BENG and MLYM don't have top matras.
    400  if block == 'Devanagari':	return  'AFTER_SUB'
    401  if block == 'Gurmukhi':	return  'AFTER_POST' # Deviate from spec
    402  if block == 'Gujarati':	return  'AFTER_SUB'
    403  if block == 'Oriya':		return  'AFTER_MAIN'
    404  if block == 'Tamil':		return  'AFTER_SUB'
    405  if block == 'Telugu':		return  'BEFORE_SUB'
    406  if block == 'Kannada':	return  'BEFORE_SUB'
    407  return 'AFTER_SUB'
    408 def matra_pos_bottom(u, block):
    409  if block == 'Devanagari':	return  'AFTER_SUB'
    410  if block == 'Bengali':	return  'AFTER_SUB'
    411  if block == 'Gurmukhi':	return  'AFTER_POST'
    412  if block == 'Gujarati':	return  'AFTER_POST'
    413  if block == 'Oriya':		return  'AFTER_SUB'
    414  if block == 'Tamil':		return  'AFTER_POST'
    415  if block == 'Telugu':		return  'BEFORE_SUB'
    416  if block == 'Kannada':	return  'BEFORE_SUB'
    417  if block == 'Malayalam':	return  'AFTER_POST'
    418  return "AFTER_SUB"
    419 def indic_matra_position(u, pos, block): # Reposition matra
    420  if pos == 'PRE_C':	return matra_pos_left(u, block)
    421  if pos == 'POST_C':	return matra_pos_right(u, block)
    422  if pos == 'ABOVE_C':	return matra_pos_top(u, block)
    423  if pos == 'BELOW_C':	return matra_pos_bottom(u, block)
    424  assert (False)
    425 
    426 def position_to_category(pos):
    427  if pos == 'PRE_C':	return 'VPre'
    428  if pos == 'ABOVE_C':	return 'VAbv'
    429  if pos == 'BELOW_C':	return 'VBlw'
    430  if pos == 'POST_C':	return 'VPst'
    431  assert(False)
    432 
    433 
    434 defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2])
    435 
    436 indic_data = {}
    437 for k, (cat, pos, block) in combined.items():
    438  cat = category_map[cat]
    439  if cat == 'SM' and pos == 'Not_Applicable':
    440    cat = 'SMPst'
    441  pos = position_map[pos]
    442  indic_data[k] = (cat, pos, block)
    443 
    444 for k,new_cat in category_overrides.items():
    445  (cat, pos, _) = indic_data.get(k, defaults)
    446  indic_data[k] = (new_cat, pos, unicode_data[2][k])
    447 
    448 # We only expect position for certain types
    449 positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst')
    450 for k, (cat, pos, block) in indic_data.items():
    451  if cat not in positioned_categories:
    452    pos = 'END'
    453    indic_data[k] = (cat, pos, block)
    454 
    455 # Position overrides are more complicated
    456 
    457 # Keep in sync with CONSONANT_FLAGS in the shaper
    458 consonant_categories = ('C', 'CS', 'Ra','CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE')
    459 matra_categories = ('M', 'MPst')
    460 smvd_categories = ('SM', 'SMPst', 'VD', 'A', 'Symbol')
    461 for k, (cat, pos, block) in indic_data.items():
    462  if cat in consonant_categories:
    463    pos = 'BASE_C'
    464  elif cat in matra_categories:
    465    if block.startswith('Khmer') or block.startswith('Myanmar'):
    466      cat = position_to_category(pos)
    467    else:
    468      pos = indic_matra_position(k, pos, block)
    469  elif cat in smvd_categories:
    470    pos = 'SMVD';
    471  indic_data[k] = (cat, pos, block)
    472 
    473 for k,new_pos in position_overrides.items():
    474  (cat, pos, _) = indic_data.get(k, defaults)
    475  indic_data[k] = (cat, new_pos, unicode_data[2][k])
    476 
    477 
    478 values = [{_: 1} for _ in defaults]
    479 for vv in indic_data.values():
    480  for i,v in enumerate(vv):
    481    values[i][v] = values[i].get (v, 0) + 1
    482 
    483 
    484 
    485 
    486 # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
    487 singles = {}
    488 for u in ALLOWED_SINGLES:
    489 singles[u] = indic_data[u]
    490 del indic_data[u]
    491 
    492 print ("/* == Start of generated table == */")
    493 print ("/*")
    494 print (" * The following table is generated by running:")
    495 print (" *")
    496 print (" *   ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
    497 print (" *")
    498 print (" * on files with these headers:")
    499 print (" *")
    500 for h in headers:
    501 for l in h:
    502 	print (" * %s" % (l.strip()))
    503 print (" */")
    504 print ()
    505 print ('#include "hb.hh"')
    506 print ()
    507 print ('#ifndef HB_NO_OT_SHAPE')
    508 print ()
    509 print ('#include "hb-ot-shaper-indic.hh"')
    510 print ()
    511 print ('#pragma GCC diagnostic push')
    512 print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
    513 print ()
    514 
    515 # Print categories
    516 for shaper in categories:
    517  print ('#include "hb-ot-shaper-%s-machine.hh"' % shaper)
    518 print ()
    519 done = {}
    520 for shaper, shaper_cats in categories.items():
    521  print ('/* %s */' % shaper)
    522  for cat in shaper_cats:
    523    v = shaper[0].upper()
    524    if cat not in done:
    525      print ("#define OT_%s %s_Cat(%s)" % (cat, v, cat))
    526      done[cat] = v
    527    else:
    528      print ('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat))
    529 print ()
    530 
    531 # Shorten values
    532 short = [{
    533 "Repha":		'Rf',
    534 "PLACEHOLDER":		'GB',
    535 "DOTTEDCIRCLE":		'DC',
    536 "SMPst":		'SP',
    537 "VPst":			'VR',
    538 "VPre":			'VL',
    539 "Robatic":		'Rt',
    540 "Xgroup":		'Xg',
    541 "Ygroup":		'Yg',
    542 "As":			'As',
    543 },{
    544 "END":			'X',
    545 "BASE_C":		'C',
    546 "ABOVE_C":		'T',
    547 "BELOW_C":		'B',
    548 "POST_C":		'R',
    549 "PRE_C":		'L',
    550 "PRE_M":		'LM',
    551 "AFTER_MAIN":		'A',
    552 "AFTER_SUB":		'AS',
    553 "BEFORE_SUB":		'BS',
    554 "AFTER_POST":		'AP',
    555 "SMVD":			'SM',
    556 }]
    557 all_shorts = [{},{}]
    558 
    559 # Add some of the values, to make them more readable, and to avoid duplicates
    560 
    561 for i in range (2):
    562 for v,s in short[i].items ():
    563 	all_shorts[i][s] = v
    564 
    565 what = ["OT", "POS"]
    566 what_short = ["_OT", "_POS"]
    567 cat_defs = []
    568 for i in range (2):
    569 vv = sorted (values[i].keys ())
    570 for v in vv:
    571 	v_no_and = v.replace ('_And_', '_')
    572 	if v in short[i]:
    573 		s = short[i][v]
    574 	else:
    575 		s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
    576 		if s in all_shorts[i]:
    577 			raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
    578 		all_shorts[i][s] = v
    579 		short[i][v] = s
    580 	cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + (v.upper () if i else v), str (values[i][v]), v))
    581 
    582 maxlen_s = max ([len (c[0]) for c in cat_defs])
    583 maxlen_l = max ([len (c[1]) for c in cat_defs])
    584 maxlen_n = max ([len (c[2]) for c in cat_defs])
    585 for s in what_short:
    586 print ()
    587 for c in [c for c in cat_defs if s in c[0]]:
    588 	print ("#define %s %s /* %s chars; %s */" %
    589 		(c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
    590 print ()
    591 print ('#pragma GCC diagnostic pop')
    592 print ()
    593 print ("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))")
    594 print ()
    595 print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short))
    596 print ()
    597 print ()
    598 
    599 total = 0
    600 used = 0
    601 last_block = None
    602 def print_block (block, start, end, data):
    603 global total, used, last_block
    604 if block and block != last_block:
    605 	print ()
    606 	print ()
    607 	print ("  /* %s */" % block)
    608 num = 0
    609 assert start % 8 == 0
    610 assert (end+1) % 8 == 0
    611 for u in range (start, end+1):
    612 	if u % 8 == 0:
    613 		print ()
    614 		print ("  /* %04X */" % u, end="")
    615 	if u in data:
    616 		num += 1
    617 	d = data.get (u, defaults)
    618 	print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")
    619 
    620 total += end - start + 1
    621 used += num
    622 if block:
    623 	last_block = block
    624 
    625 uu = sorted (indic_data)
    626 
    627 last = -100000
    628 num = 0
    629 offset = 0
    630 starts = []
    631 ends = []
    632 print ("static const uint16_t indic_table[] = {")
    633 for u in uu:
    634 if u <= last:
    635 	continue
    636 block = indic_data[u][2]
    637 
    638 start = u//8*8
    639 end = start+1
    640 while end in uu and block == indic_data[end][2]:
    641 	end += 1
    642 end = (end-1)//8*8 + 7
    643 
    644 if start != last + 1:
    645 	if start - last <= 1+16*2:
    646 		print_block (None, last+1, start-1, indic_data)
    647 	else:
    648 		if last >= 0:
    649 			ends.append (last + 1)
    650 			offset += ends[-1] - starts[-1]
    651 		print ()
    652 		print ()
    653 		print ("#define indic_offset_0x%04xu %d" % (start, offset))
    654 		starts.append (start)
    655 
    656 print_block (block, start, end, indic_data)
    657 last = end
    658 ends.append (last + 1)
    659 offset += ends[-1] - starts[-1]
    660 print ()
    661 print ()
    662 occupancy = used * 100. / total
    663 page_bits = 12
    664 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    665 print ()
    666 print ("uint16_t")
    667 print ("hb_indic_get_categories (hb_codepoint_t u)")
    668 print ("{")
    669 print ("  switch (u >> %d)" % page_bits)
    670 print ("  {")
    671 pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())])
    672 for p in sorted(pages):
    673 print ("    case 0x%0Xu:" % p)
    674 for u,d in singles.items ():
    675 	if p != u>>page_bits: continue
    676 	print ("      if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
    677 for (start,end) in zip (starts, ends):
    678 	if p not in [start>>page_bits, end>>page_bits]: continue
    679 	offset = "indic_offset_0x%04xu" % start
    680 	print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    681 print ("      break;")
    682 print ("")
    683 print ("    default:")
    684 print ("      break;")
    685 print ("  }")
    686 print ("  return _(X,X);")
    687 print ("}")
    688 print ()
    689 print ("#undef _")
    690 print ("#undef INDIC_COMBINE_CATEGORIES")
    691 for i in range (2):
    692 print ()
    693 vv = sorted (values[i].keys ())
    694 for v in vv:
    695 	print ("#undef %s_%s" %
    696 		(what_short[i], short[i][v]))
    697 print ()
    698 print ('#endif')
    699 print ()
    700 print ("/* == End of generated table == */")
    701 
    702 # Maintain at least 50% occupancy in the table */
    703 if occupancy < 50:
    704 raise Exception ("Table too sparse, please investigate: ", occupancy)