tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gen-arabic-table.py (10092B)


      1 #!/usr/bin/env python3
      2 
      3 """usage: ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt
      4 
      5 Input files:
      6 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
      7 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
      8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
      9 """
     10 
     11 import os.path, sys
     12 
     13 if len (sys.argv) != 4:
     14 sys.exit (__doc__)
     15 
     16 files = [open (x, encoding='utf-8') for x in sys.argv[1:]]
     17 
     18 headers = [[files[0].readline (), files[0].readline ()], [files[2].readline (), files[2].readline ()]]
     19 headers.append (["UnicodeData.txt does not have a header."])
     20 while files[0].readline ().find ('##################') < 0:
     21 pass
     22 
     23 blocks = {}
     24 def read_blocks(f):
     25 global blocks
     26 for line in f:
     27 
     28 	j = line.find ('#')
     29 	if j >= 0:
     30 		line = line[:j]
     31 
     32 	fields = [x.strip () for x in line.split (';')]
     33 	if len (fields) == 1:
     34 		continue
     35 
     36 	uu = fields[0].split ('..')
     37 	start = int (uu[0], 16)
     38 	if len (uu) == 1:
     39 		end = start
     40 	else:
     41 		end = int (uu[1], 16)
     42 
     43 	t = fields[1]
     44 
     45 	for u in range (start, end + 1):
     46 		blocks[u] = t
     47 
     48 def print_joining_table(f):
     49 
     50 values = {}
     51 for line in f:
     52 
     53 	if line[0] == '#':
     54 		continue
     55 
     56 	fields = [x.strip () for x in line.split (';')]
     57 	if len (fields) == 1:
     58 		continue
     59 
     60 	u = int (fields[0], 16)
     61 
     62 	if fields[3] in ["ALAPH", "DALATH RISH"]:
     63 		value = "JOINING_GROUP_" + fields[3].replace(' ', '_')
     64 	else:
     65 		value = "JOINING_TYPE_" + fields[2]
     66 	values[u] = value
     67 
     68 short_value = {}
     69 for value in sorted (set ([v for v in values.values ()] + ['JOINING_TYPE_X'])):
     70 	short = ''.join(x[0] for x in value.split('_')[2:])
     71 	assert short not in short_value.values()
     72 	short_value[value] = short
     73 
     74 print ()
     75 for value,short in short_value.items():
     76 	print ("#define %s	%s" % (short, value))
     77 
     78 uu = sorted(values.keys())
     79 num = len(values)
     80 all_blocks = set([blocks[u] for u in uu])
     81 
     82 last = -100000
     83 ranges = []
     84 for u in uu:
     85 	if u - last <= 1+16*5:
     86 		ranges[-1][-1] = u
     87 	else:
     88 		ranges.append([u,u])
     89 	last = u
     90 
     91 print ()
     92 print ("static const uint8_t joining_table[] =")
     93 print ("{")
     94 last_block = None
     95 offset = 0
     96 for start,end in ranges:
     97 
     98 	print ()
     99 	print ("#define joining_offset_0x%04xu %d" % (start, offset))
    100 
    101 	for u in range(start, end+1):
    102 
    103 		block = blocks.get(u, last_block)
    104 		value = values.get(u, "JOINING_TYPE_X")
    105 
    106 		if block != last_block or u == start:
    107 			if u != start:
    108 				print ()
    109 			if block in all_blocks:
    110 				print ("\n  /* %s */" % block)
    111 			else:
    112 				print ("\n  /* FILLER */")
    113 			last_block = block
    114 			if u % 32 != 0:
    115 				print ()
    116 				print ("  /* %04X */" % (u//32*32), "  " * (u % 32), end="")
    117 
    118 		if u % 32 == 0:
    119 			print ()
    120 			print ("  /* %04X */ " % u, end="")
    121 		print ("%s," % short_value[value], end="")
    122 	print ()
    123 
    124 	offset += end - start + 1
    125 print ()
    126 occupancy = num * 100. / offset
    127 print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
    128 print ()
    129 
    130 page_bits = 12
    131 print ()
    132 print ("static unsigned int")
    133 print ("joining_type (hb_codepoint_t u)")
    134 print ("{")
    135 print ("  switch (u >> %d)" % page_bits)
    136 print ("  {")
    137 pages = set([u>>page_bits for u in [s for s,e in ranges]+[e for s,e in ranges]])
    138 for p in sorted(pages):
    139 	print ("    case 0x%0Xu:" % p)
    140 	for (start,end) in ranges:
    141 		if p not in [start>>page_bits, end>>page_bits]: continue
    142 		offset = "joining_offset_0x%04xu" % start
    143 		print ("      if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return joining_table[u - 0x%04Xu + %s];" % (start, end, start, offset))
    144 	print ("      break;")
    145 	print ("")
    146 print ("    default:")
    147 print ("      break;")
    148 print ("  }")
    149 print ("  return X;")
    150 print ("}")
    151 print ()
    152 for value,short in short_value.items():
    153 	print ("#undef %s" % (short))
    154 print ()
    155 
    156 LIGATURES = (
    157 0xF2EE, 0xFC08, 0xFC0E, 0xFC12, 0xFC32, 0xFC3F, 0xFC40, 0xFC41, 0xFC42,
    158 0xFC44, 0xFC4E, 0xFC5E, 0xFC60, 0xFC61, 0xFC62, 0xFC6A, 0xFC6D, 0xFC6F,
    159 0xFC70, 0xFC73, 0xFC75, 0xFC86, 0xFC8F, 0xFC91, 0xFC94, 0xFC9C, 0xFC9D,
    160 0xFC9E, 0xFC9F, 0xFCA1, 0xFCA2, 0xFCA3, 0xFCA4, 0xFCA8, 0xFCAA, 0xFCAC,
    161 0xFCB0, 0xFCC9, 0xFCCA, 0xFCCB, 0xFCCC, 0xFCCD, 0xFCCE, 0xFCCF, 0xFCD0,
    162 0xFCD1, 0xFCD2, 0xFCD3, 0xFCD5, 0xFCDA, 0xFCDB, 0xFCDC, 0xFCDD, 0xFD30,
    163 0xFD88, 0xFEF5, 0xFEF6, 0xFEF7, 0xFEF8, 0xFEF9, 0xFEFA, 0xFEFB, 0xFEFC,
    164 0xF201, 0xF211, 0xF2EE,
    165 )
    166 
    167 def print_shaping_table(f):
    168 
    169 shapes = {}
    170 ligatures = {}
    171 names = {}
    172 lines = f.readlines()
    173 lines += [
    174 	"F201;PUA ARABIC LIGATURE LELLAH ISOLATED FORM;Lo;0;AL;<isolated> 0644 0644 0647;;;;N;;;;;",
    175 	"F211;PUA ARABIC LIGATURE LAM WITH MEEM WITH JEEM INITIAL FORM;Lo;0;AL;<initial> 0644 0645 062C;;;;N;;;;;",
    176 	"F2EE;PUA ARABIC LIGATURE SHADDA WITH FATHATAN ISOLATED FORM;Lo;0;AL;<isolated> 0020 064B 0651;;;;N;;;;;",
    177 ]
    178 for line in lines:
    179 
    180 	fields = [x.strip () for x in line.split (';')]
    181 	if fields[5][0:1] != '<':
    182 		continue
    183 
    184 	items = fields[5].split (' ')
    185 	shape, items = items[0][1:-1], tuple (int (x, 16) for x in items[1:])
    186 	c = int (fields[0], 16)
    187 
    188 	if not shape in ['initial', 'medial', 'isolated', 'final']:
    189 		continue
    190 
    191 	if len (items) != 1:
    192 		# Mark ligatures start with space and are in visual order, so we
    193 		# remove the space and reverse the items.
    194 		if items[0] == 0x0020:
    195 			items = items[:0:-1]
    196 			shape = None
    197 		# We only care about a subset of ligatures
    198 		if c not in LIGATURES:
    199 			continue
    200 
    201 		# Save ligature
    202 		names[c] = fields[1]
    203 		if items not in ligatures:
    204 			ligatures[items] = {}
    205 		ligatures[items][shape] = c
    206 	else:
    207 		# Save shape
    208 		if items[0] not in names:
    209 			names[items[0]] = fields[1]
    210 		else:
    211 			names[items[0]] = os.path.commonprefix ([names[items[0]], fields[1]]).strip ()
    212 		if items[0] not in shapes:
    213 			shapes[items[0]] = {}
    214 		shapes[items[0]][shape] = c
    215 
    216 print ()
    217 print ("static const uint16_t shaping_table[][4] =")
    218 print ("{")
    219 
    220 keys = shapes.keys ()
    221 min_u, max_u = min (keys), max (keys)
    222 for u in range (min_u, max_u + 1):
    223 	s = [shapes[u][shape] if u in shapes and shape in shapes[u] else 0
    224 	     for shape in  ['initial', 'medial', 'final', 'isolated']]
    225 	value = ', '.join ("0x%04Xu" % c for c in s)
    226 	print ("  {%s}, /* U+%04X %s */" % (value, u, names[u] if u in names else ""))
    227 
    228 print ("};")
    229 print ()
    230 print ("#define SHAPING_TABLE_FIRST	0x%04Xu" % min_u)
    231 print ("#define SHAPING_TABLE_LAST	0x%04Xu" % max_u)
    232 print ()
    233 
    234 ligas_2 = {}
    235 ligas_3 = {}
    236 ligas_mark_2 = {}
    237 for key in ligatures.keys ():
    238 	for shape in ligatures[key]:
    239 		c = ligatures[key][shape]
    240 		if len(key) == 3:
    241 			if shape == 'isolated':
    242 				liga = (shapes[key[0]]['initial'], shapes[key[1]]['medial'], shapes[key[2]]['final'])
    243 			elif shape == 'final':
    244 				liga = (shapes[key[0]]['medial'], shapes[key[1]]['medial'], shapes[key[2]]['final'])
    245 			elif shape == 'initial':
    246 				liga = (shapes[key[0]]['initial'], shapes[key[1]]['medial'], shapes[key[2]]['medial'])
    247 			else:
    248 				raise Exception ("Unexpected shape", shape)
    249 			if liga[0] not in ligas_3:
    250 				ligas_3[liga[0]] = []
    251 			ligas_3[liga[0]].append ((liga[1], liga[2], c))
    252 		elif len(key) == 2:
    253 			if shape is None:
    254 				liga = key
    255 				if liga[0] not in ligas_mark_2:
    256 					ligas_mark_2[liga[0]] = []
    257 				ligas_mark_2[liga[0]].append ((liga[1], c))
    258 				continue
    259 			elif shape == 'isolated':
    260 				liga = (shapes[key[0]]['initial'], shapes[key[1]]['final'])
    261 			elif shape == 'final':
    262 				liga = (shapes[key[0]]['medial'], shapes[key[1]]['final'])
    263 			elif shape == 'initial':
    264 				liga = (shapes[key[0]]['initial'], shapes[key[1]]['medial'])
    265 			else:
    266 				raise Exception ("Unexpected shape", shape)
    267 			if liga[0] not in ligas_2:
    268 				ligas_2[liga[0]] = []
    269 			ligas_2[liga[0]].append ((liga[1], c))
    270 		else:
    271 			raise Exception ("Unexpected number of ligature components", key)
    272 max_i = max (len (ligas_2[l]) for l in ligas_2)
    273 print ()
    274 print ("static const struct ligature_set_t {")
    275 print (" uint16_t first;")
    276 print (" struct ligature_pairs_t {")
    277 print ("   uint16_t components[1];")
    278 print ("   uint16_t ligature;")
    279 print (" } ligatures[%d];" % max_i)
    280 print ("} ligature_table[] =")
    281 print ("{")
    282 for first in sorted (ligas_2.keys ()):
    283 
    284 	print ("  { 0x%04Xu, {" % (first))
    285 	for liga in ligas_2[first]:
    286 		print ("    { {0x%04Xu}, 0x%04Xu }, /* %s */" % (liga[0], liga[1], names[liga[1]]))
    287 	print ("  }},")
    288 
    289 print ("};")
    290 print ()
    291 
    292 max_i = max (len (ligas_mark_2[l]) for l in ligas_mark_2)
    293 print ()
    294 print ("static const struct ligature_mark_set_t {")
    295 print (" uint16_t first;")
    296 print (" struct ligature_pairs_t {")
    297 print ("   uint16_t components[1];")
    298 print ("   uint16_t ligature;")
    299 print (" } ligatures[%d];" % max_i)
    300 print ("} ligature_mark_table[] =")
    301 print ("{")
    302 for first in sorted (ligas_mark_2.keys ()):
    303 
    304 	print ("  { 0x%04Xu, {" % (first))
    305 	for liga in ligas_mark_2[first]:
    306 		print ("    { {0x%04Xu}, 0x%04Xu }, /* %s */" % (liga[0], liga[1], names[liga[1]]))
    307 	print ("  }},")
    308 
    309 print ("};")
    310 print ()
    311 
    312 max_i = max (len (ligas_3[l]) for l in ligas_3)
    313 print ()
    314 print ("static const struct ligature_3_set_t {")
    315 print (" uint16_t first;")
    316 print (" struct ligature_triplets_t {")
    317 print ("   uint16_t components[2];")
    318 print ("   uint16_t ligature;")
    319 print (" } ligatures[%d];" % max_i)
    320 print ("} ligature_3_table[] =")
    321 print ("{")
    322 for first in sorted (ligas_3.keys ()):
    323 
    324 	print ("  { 0x%04Xu, {" % (first))
    325 	for liga in ligas_3[first]:
    326 		print ("    { {0x%04Xu, 0x%04Xu}, 0x%04Xu}, /* %s */" % (liga[0], liga[1], liga[2], names[liga[2]]))
    327 	print ("  }},")
    328 
    329 print ("};")
    330 print ()
    331 
    332 
    333 
    334 print ("/* == Start of generated table == */")
    335 print ("/*")
    336 print (" * The following table is generated by running:")
    337 print (" *")
    338 print (" *   ./gen-arabic-table.py ArabicShaping.txt UnicodeData.txt Blocks.txt")
    339 print (" *")
    340 print (" * on files with these headers:")
    341 print (" *")
    342 for h in headers:
    343 for l in h:
    344 	print (" * %s" % (l.strip()))
    345 print (" */")
    346 print ()
    347 print ("#ifndef HB_OT_SHAPER_ARABIC_TABLE_HH")
    348 print ("#define HB_OT_SHAPER_ARABIC_TABLE_HH")
    349 print ()
    350 
    351 read_blocks (files[2])
    352 print_joining_table (files[0])
    353 print_shaping_table (files[1])
    354 
    355 print ()
    356 print ("#endif /* HB_OT_SHAPER_ARABIC_TABLE_HH */")
    357 print ()
    358 print ("/* == End of generated table == */")